diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 740fdb84c..94906dbce 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,5 +1,5 @@
 ci:
-  skip: [eslint]
+  skip: [eslint, pyright]
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.6.0
@@ -17,6 +17,12 @@ repos:
         args: [--fix]
       # Run the formatter.
       - id: ruff-format
+  - repo: local
+    hooks:
+      - id: pyright
+        name: "Pyright"
+        entry: bash -c 'ENV_NAME=dats source backend/_activate_current_env.sh && pyright'
+        language: system
   - repo: https://github.com/pre-commit/mirrors-eslint
     rev: v9.11.0
     hooks:
diff --git a/backend/environment.yml b/backend/environment.yml
index 2e9c15cfd..014cab45a 100644
--- a/backend/environment.yml
+++ b/backend/environment.yml
@@ -33,6 +33,7 @@ dependencies:
   - pre-commit=3.3.3
   - psycopg2-binary=2.9
   - pydantic=2.5.3
+  - pyright=1.1.385
   - pytest=7.4.3
   - python-jose=3.3
   - python-magic=0.4
diff --git a/backend/src/alembic/versions/7393d4227260_memo_updated_non_nullable.py b/backend/src/alembic/versions/7393d4227260_memo_updated_non_nullable.py
new file mode 100644
index 000000000..9ca46eac8
--- /dev/null
+++ b/backend/src/alembic/versions/7393d4227260_memo_updated_non_nullable.py
@@ -0,0 +1,54 @@
+"""memo updated and created columns are now non-nullable
+Revision ID: 7393d4227260
+Revises: b2e6991379f5
+Create Date: 2024-10-17 20:14:45.219331
+
+"""
+
+from typing import Sequence, Union
+
+from sqlalchemy.dialects import postgresql
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "7393d4227260"
+down_revision: Union[str, None] = "b2e6991379f5"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.alter_column(
+        "memo",
+        "created",
+        existing_type=postgresql.TIMESTAMP(),
+        nullable=False,
+        existing_server_default="now()",
+    )
+    op.alter_column(
+        "memo",
+        "updated",
+        existing_type=postgresql.TIMESTAMP(),
+        nullable=False,
+        existing_server_default="now()",
+    )
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.alter_column(
+        "memo",
+        "updated",
+        existing_type=postgresql.TIMESTAMP(),
+        nullable=True,
+        existing_server_default="now()",
+    )
+    op.alter_column(
+        "memo",
+        "created",
+        existing_type=postgresql.TIMESTAMP(),
+        nullable=True,
+        existing_server_default="now()",
+    )
+    # ### end Alembic commands ###
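This PR wires Pyright into the toolchain (a local pre-commit hook plus a pinned conda dependency); most of the backend hunks below are fixes for issues it flagged. Separately, the new migration sets `memo.created`/`memo.updated` to NOT NULL, which PostgreSQL rejects if any existing row still holds a NULL in those columns. If legacy rows could be affected, a backfill before the `alter_column` calls would make the upgrade safe. A minimal sketch (hypothetical, not part of this patch), reusing the columns' existing `now()` server default:

```python
from alembic import op


def upgrade() -> None:
    # Hypothetical backfill: only needed if legacy memo rows may hold NULLs.
    op.execute("UPDATE memo SET created = now() WHERE created IS NULL")
    op.execute("UPDATE memo SET updated = now() WHERE updated IS NULL")
    # ...then the alter_column(..., nullable=False) calls above are safe.
```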
diff --git a/backend/src/app/celery/background_jobs/__init__.py b/backend/src/app/celery/background_jobs/__init__.py
index 6572f5984..b98e577d6 100644
--- a/backend/src/app/celery/background_jobs/__init__.py
+++ b/backend/src/app/celery/background_jobs/__init__.py
@@ -1,6 +1,8 @@
 from pathlib import Path
 from typing import Any, List
 
+from celery import Task
+
 from app.core.data.crawler.crawler_service import CrawlerService
 from app.core.data.dto.crawler_job import CrawlerJobParameters, CrawlerJobRead
 from app.core.data.dto.export_job import ExportJobParameters, ExportJobRead
@@ -15,6 +17,8 @@ def start_cota_refinement_job_async(
 ) -> None:
     from app.celery.background_jobs.tasks import start_cota_refinement_job_task
 
+    assert isinstance(start_cota_refinement_job_task, Task), "Not a Celery Task"
+
     start_cota_refinement_job_task.apply_async(kwargs={"cota_job_id": cota_job_id})
 
 
@@ -23,6 +27,8 @@ def start_trainer_job_async(
 ) -> None:
     from app.celery.background_jobs.tasks import start_trainer_job_task
 
+    assert isinstance(start_trainer_job_task, Task), "Not a Celery Task"
+
     start_trainer_job_task.apply_async(kwargs={"trainer_job_id": trainer_job_id})
 
 
@@ -31,6 +37,8 @@ def import_uploaded_archive_apply_async(
 ) -> Any:
     from app.celery.background_jobs.tasks import import_uploaded_archive
 
+    assert isinstance(import_uploaded_archive, Task), "Not a Celery Task"
+
     return import_uploaded_archive.apply_async(
         kwargs={"archive_file_path_and_project_id": (archive_file_path, project_id)},
     )
@@ -41,6 +49,8 @@ def prepare_and_start_export_job_async(
 ) -> ExportJobRead:
     from app.celery.background_jobs.tasks import start_export_job
 
+    assert isinstance(start_export_job, Task), "Not a Celery Task"
+
     exs: ExportService = ExportService()
     ex_job = exs.prepare_export_job(export_params)
     start_export_job.apply_async(kwargs={"export_job": ex_job})
@@ -55,16 +65,19 @@ def prepare_and_start_crawling_job_async(
         start_crawler_job,
     )
 
+    assert isinstance(start_crawler_job, Task), "Not a Celery Task"
+    assert isinstance(import_uploaded_archive, Task), "Not a Celery Task"
+
     cs: CrawlerService = CrawlerService()
     cj = cs.prepare_crawler_job(crawler_params)
-    start_export_job_chain = (
-        # crawl the data via scrapy and zip the data
-        start_crawler_job.signature(kwargs={"crawler_job": cj})
-        |
-        # import the zip
-        # TODO create a PPJ for the import
-        import_uploaded_archive.signature()
-    )
+
+    job1 = start_crawler_job.signature(kwargs={"crawler_job": cj})
+    job2 = import_uploaded_archive.signature()
+
+    assert job1 is not None, "Job 1 is None"
+    assert job2 is not None, "Job 2 is None"
+
+    start_export_job_chain = job1 | job2
     start_export_job_chain.apply_async()
     return cj
 
@@ -75,6 +88,8 @@ def prepare_and_start_llm_job_async(
 ) -> LLMJobRead:
     from app.celery.background_jobs.tasks import start_llm_job
 
+    assert isinstance(start_llm_job, Task), "Not a Celery Task"
+
     llms: LLMService = LLMService()
     llm_job = llms.prepare_llm_job(llm_job_params)
     start_llm_job.apply_async(kwargs={"llm_job": llm_job})
@@ -88,6 +103,10 @@ def execute_text_preprocessing_pipeline_apply_async(
         execute_text_preprocessing_pipeline_task,
     )
 
+    assert isinstance(
+        execute_text_preprocessing_pipeline_task, Task
+    ), "Not a Celery Task"
+
     for cargo in cargos:
         execute_text_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})
 
@@ -99,6 +118,10 @@ def execute_image_preprocessing_pipeline_apply_async(
         execute_image_preprocessing_pipeline_task,
     )
 
+    assert isinstance(
+        execute_image_preprocessing_pipeline_task, Task
+    ), "Not a Celery Task"
+
     for cargo in cargos:
         execute_image_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})
 
@@ -110,6 +133,10 @@ def execute_audio_preprocessing_pipeline_apply_async(
         execute_audio_preprocessing_pipeline_task,
    )
 
+    assert isinstance(
+        execute_audio_preprocessing_pipeline_task, Task
+    ), "Not a Celery Task"
+
     for cargo in cargos:
         execute_audio_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})
 
@@ -121,5 +148,9 @@ def execute_video_preprocessing_pipeline_apply_async(
         execute_video_preprocessing_pipeline_task,
     )
 
+    assert isinstance(
+        execute_video_preprocessing_pipeline_task, Task
+    ), "Not a Celery Task"
+
     for cargo in cargos:
         execute_video_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})
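The repeated `assert isinstance(..., Task)` guards exist to satisfy Pyright: Celery's task decorator hides the fact that the imported attributes are `Task` objects, so the assert both narrows the static type and fails fast at runtime. If the pattern keeps spreading, one option is a small shared helper; a sketch under that assumption (the `as_task` helper is hypothetical, not part of this diff):

```python
from celery import Task


def as_task(obj: object) -> Task:
    """Narrow an imported Celery task to Task for the type checker."""
    assert isinstance(obj, Task), "Not a Celery Task"
    return obj
```

Usage would then collapse each call site to e.g. `as_task(start_llm_job).apply_async(kwargs={"llm_job": llm_job})`.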
diff --git a/backend/src/app/core/analysis/analysis_service.py b/backend/src/app/core/analysis/analysis_service.py
index 83561dbe2..f33c6944d 100644
--- a/backend/src/app/core/analysis/analysis_service.py
+++ b/backend/src/app/core/analysis/analysis_service.py
@@ -361,14 +361,14 @@ def sample_sdocs_by_tags(
     sample_fixed = (
         df.groupby(by=list(groups))
         .sample(n=min(n, min_count))
-        .groupby(by=list(groups))["sdoc"]
+        .groupby(by=list(groups))["sdoc"]  # type: ignore
         .apply(list)
         .to_dict()
     )
     sample_relative = (
         df.groupby(by=list(groups))
         .sample(frac=frac)
-        .groupby(by=list(groups))["sdoc"]
+        .groupby(by=list(groups))["sdoc"]  # type: ignore
         .apply(list)
         .to_dict()
     )
diff --git a/backend/src/app/core/analysis/timeline.py b/backend/src/app/core/analysis/timeline.py
index e26daf9f0..2936df79d 100644
--- a/backend/src/app/core/analysis/timeline.py
+++ b/backend/src/app/core/analysis/timeline.py
@@ -166,7 +166,7 @@ def timeline_analysis(
     query = db.query(
         sdoc_ids_agg,
-        *group_by.apply(subquery.c[1]),  # type: ignore
+        *group_by.apply(subquery.c["date"]),  # type: ignore
     ).join(subquery, SourceDocumentORM.id == subquery.c.id)
 
     query = apply_filtering(
diff --git a/backend/src/app/core/data/repo/repo_service.py b/backend/src/app/core/data/repo/repo_service.py
index f986e1491..18a44cae7 100644
--- a/backend/src/app/core/data/repo/repo_service.py
+++ b/backend/src/app/core/data/repo/repo_service.py
@@ -66,13 +66,6 @@ def __init__(self, dst_path: Path):
         )
 
 
-class UnsupportedDocTypeForMimeType(Exception):
-    def __init__(self, mime_type: str):
-        super().__init__(
-            f"Unsupported DocType! Cannot infer DocType from MimeType '{mime_type}'."
-        )
-
-
 class ErroneousArchiveException(Exception):
     def __init__(self, archive_path: Path):
         super().__init__(f"Error with Archive {archive_path}")
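The `timeline.py` change replaces positional subquery column access (`subquery.c[1]`) with name-based access (`subquery.c["date"]`), which keeps working if the select list is reordered. A self-contained illustration of the difference (table and column names are invented for the example, assuming the real subquery labels its date expression `"date"`):

```python
from sqlalchemy import Column, Date, Integer, MetaData, Table, func, select

metadata = MetaData()
sdoc = Table(
    "sourcedocument",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("created", Date),
)

# Labeling the computed column makes it addressable by name on the subquery.
subquery = select(
    sdoc.c.id,
    func.date(sdoc.c.created).label("date"),
).subquery()

by_position = subquery.c[1]   # brittle: silently wrong if columns are reordered
by_name = subquery.c["date"]  # robust: tied to the explicit label
```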
diff --git a/backend/src/app/core/filters/filtering.py b/backend/src/app/core/filters/filtering.py
index 029a1c1bf..63b383b71 100644
--- a/backend/src/app/core/filters/filtering.py
+++ b/backend/src/app/core/filters/filtering.py
@@ -18,6 +18,7 @@
     NumberOperator,
     StringOperator,
 )
+from app.core.filters.types import FilterValue
 
 
 class LogicalOperator(str, Enum):
@@ -36,8 +37,6 @@ def get_sqlalchemy_operator(self):
 
 T = TypeVar("T", bound=AbstractColumns)
 
-FilterValue = Union[bool, str, int, List[str], List[List[str]]]
-
 
 class FilterExpression(BaseModel, Generic[T]):
     id: str
diff --git a/backend/src/app/core/filters/filtering_operators.py b/backend/src/app/core/filters/filtering_operators.py
index 1bf57be37..28dd3cf48 100644
--- a/backend/src/app/core/filters/filtering_operators.py
+++ b/backend/src/app/core/filters/filtering_operators.py
@@ -3,7 +3,7 @@
 from sqlalchemy import not_
 from sqlalchemy.orm import QueryableAttribute
 
-from app.core.filters.filtering import FilterValue
+from app.core.filters.types import FilterValue
 
 
 class FilterValueType(Enum):
diff --git a/backend/src/app/core/filters/types.py b/backend/src/app/core/filters/types.py
new file mode 100644
index 000000000..a6640aeda
--- /dev/null
+++ b/backend/src/app/core/filters/types.py
@@ -0,0 +1,3 @@
+from typing import List, Union
+
+FilterValue = Union[bool, str, int, List[str], List[List[str]]]
diff --git a/backend/src/app/preprocessing/pipeline/steps/text/extract_text_from_html_and_create_source_mapping.py b/backend/src/app/preprocessing/pipeline/steps/text/extract_text_from_html_and_create_source_mapping.py
index 798b1efc0..887272b3a 100644
--- a/backend/src/app/preprocessing/pipeline/steps/text/extract_text_from_html_and_create_source_mapping.py
+++ b/backend/src/app/preprocessing/pipeline/steps/text/extract_text_from_html_and_create_source_mapping.py
@@ -1,14 +1,20 @@
 import re
 from html.parser import HTMLParser
 from itertools import accumulate
-from typing import Dict, List, Union
+from typing import List, Optional, TypedDict
 
 from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
 from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc
 
 
+class Text(TypedDict):
+    text: str
+    start: int
+    end: int
+
+
 class CustomLineHTMLParser(HTMLParser):
-    result: List[Dict[str, Union[str, int]]]
+    result: List[Text]
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -23,7 +29,7 @@ def current_index(self):
         line, char = self.getpos()
         return self.line_lengths[line - 1] + char
 
-    def __call__(self, data: str) -> List[Dict[str, Union[str, int]]]:
+    def __call__(self, data: str) -> List[Text]:
         self.reset()
         self.line_lengths = [0] + list(
             accumulate(len(line) for line in data.splitlines(keepends=True))
@@ -37,21 +43,13 @@ class HTMLTextMapper(CustomLineHTMLParser):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.result = []
-        self.text = {
-            "text": "",
-            "start": 0,
-            "end": 0,
-        }
+        self.text: Optional[Text] = None
         self.end_spaces = 0
 
     def reset(self):
         super().reset()
         self.result = []
-        self.text = {
-            "text": "",
-            "start": 0,
-            "end": 0,
-        }
+        self.text = None
 
     def handle_data(self, data: str):
         # only add text if it is not only whitespaces!
@@ -68,6 +66,7 @@ def handle_data(self, data: str):
             self.text = {
                 "text": data.strip(),
                 "start": self.current_index + start_spaces,
+                "end": -1,
             }
 
     def handle_starttag(self, tag, attrs):
@@ -80,14 +79,11 @@ def handle_comment(self, data):
         self.text_end()
 
     def text_end(self):
-        self.text["end"] = self.current_index - self.end_spaces
-        self.result.append(self.text)
-        self.text = {
-            "text": "",
-            "start": 0,
-            "end": 0,
-        }
-        self.end_spaces = 0
+        if self.text:
+            self.text["end"] = self.current_index - self.end_spaces
+            self.result.append(self.text)
+            self.text = None
+            self.end_spaces = 0
 
     def close(self):
         super().close()
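Two related type cleanups above: `FilterValue` moves into a new `filters/types.py`, presumably so `filtering_operators.py` no longer imports it from `filtering.py` (breaking a circular import between the two modules), and the HTML mapper swaps its loosely typed `Dict[str, Union[str, int]]` sentinel dict for a `Text` TypedDict plus `Optional[Text] = None`. Because `TypedDict` is total by default, every construction must supply all keys, which is why `handle_data` now writes a placeholder `"end": -1`. A compact standalone sketch of the pattern (names mirror the patch):

```python
from typing import List, Optional, TypedDict


class Text(TypedDict):
    text: str
    start: int
    end: int


spans: List[Text] = []
current: Optional[Text] = None

# All keys are required at construction; "end" holds a placeholder until known.
current = {"text": "Hello", "start": 0, "end": -1}

# None now means "no open span", replacing the old all-zero sentinel dict.
if current is not None:
    current["end"] = current["start"] + len(current["text"])
    spans.append(current)
    current = None
```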
diff --git a/backend/src/app/preprocessing/preprocessing_service.py b/backend/src/app/preprocessing/preprocessing_service.py
index 946e3b484..23a0a6302 100644
--- a/backend/src/app/preprocessing/preprocessing_service.py
+++ b/backend/src/app/preprocessing/preprocessing_service.py
@@ -31,7 +31,6 @@
 from app.core.data.repo.repo_service import (
     FileNotFoundInRepositoryError,
     RepoService,
-    UnsupportedDocTypeForMimeType,
     UnsupportedDocTypeForSourceDocument,
 )
 from app.core.db.sql_service import SQLService
@@ -40,6 +39,13 @@
 from app.util.singleton_meta import SingletonMeta
 
 
+class UnsupportedDocTypeForMimeType(Exception):
+    def __init__(self, mime_type: str):
+        super().__init__(
+            f"Unsupported DocType! Cannot infer DocType from MimeType '{mime_type}'."
+        )
+
+
 class PreprocessingService(metaclass=SingletonMeta):
     def __new__(cls, *args, **kwargs):
         cls.sqls: SQLService = SQLService()
diff --git a/frontend/src/api/openapi/models/ExportJobParameters.ts b/frontend/src/api/openapi/models/ExportJobParameters.ts
index 2df09c86b..56327df1b 100644
--- a/frontend/src/api/openapi/models/ExportJobParameters.ts
+++ b/frontend/src/api/openapi/models/ExportJobParameters.ts
@@ -22,7 +22,7 @@ export type ExportJobParameters = {
   /**
    * The format of the exported data.
    */
-  export_format?: ExportFormat | null;
+  export_format?: ExportFormat;
   /**
    * Specific parameters for the export job w.r.t it's type
    */
diff --git a/frontend/src/openapi.json b/frontend/src/openapi.json
index a17d469f1..72bc424a5 100644
--- a/frontend/src/openapi.json
+++ b/frontend/src/openapi.json
@@ -7238,7 +7238,7 @@
           "description": "The type of the export job (what to export)"
         },
         "export_format": {
-          "anyOf": [{ "$ref": "#/components/schemas/ExportFormat" }, { "type": "null" }],
+          "allOf": [{ "$ref": "#/components/schemas/ExportFormat" }],
           "description": "The format of the exported data.",
           "default": "CSV"
         },
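The frontend files are regenerated artifacts: once `export_format` stops being nullable on the backend, the OpenAPI schema changes from `anyOf [ExportFormat, null]` to `allOf [ExportFormat]` and the generated TS type drops `| null`. With FastAPI/pydantic this typically corresponds to dropping `Optional` in favor of a plain field with a default. A hypothetical reconstruction of the backend DTO that would emit this schema (the actual DTO is not in this diff, and the `JSON` member is an assumption; only the `CSV` default is visible here):

```python
from enum import Enum

from pydantic import BaseModel, Field


class ExportFormat(str, Enum):
    CSV = "CSV"
    JSON = "JSON"  # assumed member; only CSV appears in this diff


class ExportJobParameters(BaseModel):
    # Optional[ExportFormat] emits anyOf [ExportFormat, null] in the schema;
    # a required-with-default field emits allOf [ExportFormat].
    export_format: ExportFormat = Field(
        default=ExportFormat.CSV,
        description="The format of the exported data.",
    )
```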