Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix backend problems #449

Merged
merged 10 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
ci:
skip: [eslint]
skip: [eslint, pyright]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
Expand All @@ -17,6 +17,12 @@ repos:
args: [--fix]
# Run the formatter.
- id: ruff-format
- repo: local
hooks:
- id: pyright
name: "Pyright"
entry: bash -c 'ENV_NAME=dats source backend/_activate_current_env.sh && pyright'
language: system
- repo: https://github.com/pre-commit/mirrors-eslint
rev: v9.11.0
hooks:
Expand Down
1 change: 1 addition & 0 deletions backend/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies:
- pre-commit=3.3.3
- psycopg2-binary=2.9
- pydantic=2.5.3
- pyright=1.1.385
- pytest=7.4.3
- python-jose=3.3
- python-magic=0.4
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""memo updated and created columns are now non-nullable
Revision ID: 7393d4227260
Revises: b2e6991379f5
Create Date: 2024-10-17 20:14:45.219331

"""

from typing import Sequence, Union

from sqlalchemy.dialects import postgresql

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "7393d4227260"
down_revision: Union[str, None] = "b2e6991379f5"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Make memo.created and memo.updated NOT NULL.

    Only the nullability changes; the existing TIMESTAMP type and the
    "now()" server default are left untouched.
    """
    # Same order as the auto-generated migration: created, then updated.
    for column_name in ("created", "updated"):
        op.alter_column(
            "memo",
            column_name,
            existing_type=postgresql.TIMESTAMP(),
            nullable=False,
            existing_server_default="now()",
        )


def downgrade() -> None:
    """Revert: allow NULLs again in memo.updated and memo.created."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Columns are reverted in the reverse order of upgrade().
    for column_name in ("updated", "created"):
        op.alter_column(
            "memo",
            column_name,
            existing_type=postgresql.TIMESTAMP(),
            nullable=True,
            existing_server_default="now()",
        )
    # ### end Alembic commands ###
47 changes: 39 additions & 8 deletions backend/src/app/celery/background_jobs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from pathlib import Path
from typing import Any, List

from celery import Task

from app.core.data.crawler.crawler_service import CrawlerService
from app.core.data.dto.crawler_job import CrawlerJobParameters, CrawlerJobRead
from app.core.data.dto.export_job import ExportJobParameters, ExportJobRead
Expand All @@ -15,6 +17,8 @@ def start_cota_refinement_job_async(
) -> None:
from app.celery.background_jobs.tasks import start_cota_refinement_job_task

assert isinstance(start_cota_refinement_job_task, Task), "Not a Celery Task"

start_cota_refinement_job_task.apply_async(kwargs={"cota_job_id": cota_job_id})


Expand All @@ -23,6 +27,8 @@ def start_trainer_job_async(
) -> None:
from app.celery.background_jobs.tasks import start_trainer_job_task

assert isinstance(start_trainer_job_task, Task), "Not a Celery Task"

start_trainer_job_task.apply_async(kwargs={"trainer_job_id": trainer_job_id})


Expand All @@ -31,6 +37,8 @@ def import_uploaded_archive_apply_async(
) -> Any:
from app.celery.background_jobs.tasks import import_uploaded_archive

assert isinstance(import_uploaded_archive, Task), "Not a Celery Task"

return import_uploaded_archive.apply_async(
kwargs={"archive_file_path_and_project_id": (archive_file_path, project_id)},
)
Expand All @@ -41,6 +49,8 @@ def prepare_and_start_export_job_async(
) -> ExportJobRead:
from app.celery.background_jobs.tasks import start_export_job

assert isinstance(start_export_job, Task), "Not a Celery Task"

exs: ExportService = ExportService()
ex_job = exs.prepare_export_job(export_params)
start_export_job.apply_async(kwargs={"export_job": ex_job})
Expand All @@ -55,16 +65,19 @@ def prepare_and_start_crawling_job_async(
start_crawler_job,
)

assert isinstance(start_crawler_job, Task), "Not a Celery Task"
assert isinstance(import_uploaded_archive, Task), "Not a Celery Task"

cs: CrawlerService = CrawlerService()
cj = cs.prepare_crawler_job(crawler_params)
start_export_job_chain = (
# crawl the data via scrapy and zip the data
start_crawler_job.signature(kwargs={"crawler_job": cj})
|
# import the zip
# TODO create a PPJ for the import
import_uploaded_archive.signature()
)

job1 = start_crawler_job.signature(kwargs={"crawler_job": cj})
job2 = import_uploaded_archive.signature()

assert job1 is not None, "Job 1 is None"
assert job2 is not None, "Job 2 is None"

start_export_job_chain = job1 | job2
start_export_job_chain.apply_async()

return cj
Expand All @@ -75,6 +88,8 @@ def prepare_and_start_llm_job_async(
) -> LLMJobRead:
from app.celery.background_jobs.tasks import start_llm_job

assert isinstance(start_llm_job, Task), "Not a Celery Task"

llms: LLMService = LLMService()
llm_job = llms.prepare_llm_job(llm_job_params)
start_llm_job.apply_async(kwargs={"llm_job": llm_job})
Expand All @@ -88,6 +103,10 @@ def execute_text_preprocessing_pipeline_apply_async(
execute_text_preprocessing_pipeline_task,
)

assert isinstance(
execute_text_preprocessing_pipeline_task, Task
), "Not a Celery Task"

for cargo in cargos:
execute_text_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})

Expand All @@ -99,6 +118,10 @@ def execute_image_preprocessing_pipeline_apply_async(
execute_image_preprocessing_pipeline_task,
)

assert isinstance(
execute_image_preprocessing_pipeline_task, Task
), "Not a Celery Task"

for cargo in cargos:
execute_image_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})

Expand All @@ -110,6 +133,10 @@ def execute_audio_preprocessing_pipeline_apply_async(
execute_audio_preprocessing_pipeline_task,
)

assert isinstance(
execute_audio_preprocessing_pipeline_task, Task
), "Not a Celery Task"

for cargo in cargos:
execute_audio_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})

Expand All @@ -121,5 +148,9 @@ def execute_video_preprocessing_pipeline_apply_async(
execute_video_preprocessing_pipeline_task,
)

assert isinstance(
execute_video_preprocessing_pipeline_task, Task
), "Not a Celery Task"

for cargo in cargos:
execute_video_preprocessing_pipeline_task.apply_async(kwargs={"cargo": cargo})
4 changes: 2 additions & 2 deletions backend/src/app/core/analysis/analysis_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,14 +361,14 @@ def sample_sdocs_by_tags(
sample_fixed = (
df.groupby(by=list(groups))
.sample(n=min(n, min_count))
.groupby(by=list(groups))["sdoc"]
.groupby(by=list(groups))["sdoc"] # type: ignore
.apply(list)
.to_dict()
)
sample_relative = (
df.groupby(by=list(groups))
.sample(frac=frac)
.groupby(by=list(groups))["sdoc"]
.groupby(by=list(groups))["sdoc"] # type: ignore
.apply(list)
.to_dict()
)
Expand Down
2 changes: 1 addition & 1 deletion backend/src/app/core/analysis/timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def timeline_analysis(

query = db.query(
sdoc_ids_agg,
*group_by.apply(subquery.c[1]), # type: ignore
*group_by.apply(subquery.c["date"]), # type: ignore
).join(subquery, SourceDocumentORM.id == subquery.c.id)

query = apply_filtering(
Expand Down
7 changes: 0 additions & 7 deletions backend/src/app/core/data/repo/repo_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,13 +66,6 @@ def __init__(self, dst_path: Path):
)


class UnsupportedDocTypeForMimeType(Exception):
    """Signals that a MIME type does not map to any supported DocType."""

    def __init__(self, mime_type: str) -> None:
        super().__init__(
            "Unsupported DocType! "
            f"Cannot infer DocType from MimeType '{mime_type}'."
        )


class ErroneousArchiveException(Exception):
def __init__(self, archive_path: Path):
super().__init__(f"Error with Archive {archive_path}")
Expand Down
3 changes: 1 addition & 2 deletions backend/src/app/core/filters/filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
NumberOperator,
StringOperator,
)
from app.core.filters.types import FilterValue


class LogicalOperator(str, Enum):
Expand All @@ -36,8 +37,6 @@ def get_sqlalchemy_operator(self):

T = TypeVar("T", bound=AbstractColumns)

FilterValue = Union[bool, str, int, List[str], List[List[str]]]


class FilterExpression(BaseModel, Generic[T]):
id: str
Expand Down
2 changes: 1 addition & 1 deletion backend/src/app/core/filters/filtering_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from sqlalchemy import not_
from sqlalchemy.orm import QueryableAttribute

from app.core.filters.filtering import FilterValue
from app.core.filters.types import FilterValue


class FilterValueType(Enum):
Expand Down
3 changes: 3 additions & 0 deletions backend/src/app/core/filters/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from typing import List, Union

# Central alias for the values a filter expression can carry: scalars
# (bool/str/int), a list of strings, or a list of string-lists.
# NOTE(review): kept in its own module and imported by both filtering.py
# and filtering_operators.py — presumably to avoid an import cycle between
# those two modules; confirm against their import graph.
FilterValue = Union[bool, str, int, List[str], List[List[str]]]
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import re
from html.parser import HTMLParser
from itertools import accumulate
from typing import Dict, List, Union
from typing import List, Optional, TypedDict

from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
from app.preprocessing.pipeline.model.text.preprotextdoc import PreProTextDoc


# A text fragment extracted from HTML together with its character offsets
# in the source document. Functional TypedDict form — equivalent to the
# class-based declaration for both runtime and type-checking purposes.
# NOTE(review): whether "end" is inclusive or exclusive is not visible
# here — confirm against the parser's text_end() logic.
Text = TypedDict("Text", {"text": str, "start": int, "end": int})


class CustomLineHTMLParser(HTMLParser):
result: List[Dict[str, Union[str, int]]]
result: List[Text]

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -23,7 +29,7 @@ def current_index(self):
line, char = self.getpos()
return self.line_lengths[line - 1] + char

def __call__(self, data: str) -> List[Dict[str, Union[str, int]]]:
def __call__(self, data: str) -> List[Text]:
self.reset()
self.line_lengths = [0] + list(
accumulate(len(line) for line in data.splitlines(keepends=True))
Expand All @@ -37,21 +43,13 @@ class HTMLTextMapper(CustomLineHTMLParser):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.result = []
self.text = {
"text": "",
"start": 0,
"end": 0,
}
self.text: Optional[Text] = None
self.end_spaces = 0

def reset(self):
super().reset()
self.result = []
self.text = {
"text": "",
"start": 0,
"end": 0,
}
self.text = None

def handle_data(self, data: str):
# only add text if it is not only whitespaces!
Expand All @@ -68,6 +66,7 @@ def handle_data(self, data: str):
self.text = {
"text": data.strip(),
"start": self.current_index + start_spaces,
"end": -1,
}

def handle_starttag(self, tag, attrs):
Expand All @@ -80,14 +79,11 @@ def handle_comment(self, data):
self.text_end()

def text_end(self):
self.text["end"] = self.current_index - self.end_spaces
self.result.append(self.text)
self.text = {
"text": "",
"start": 0,
"end": 0,
}
self.end_spaces = 0
if self.text:
self.text["end"] = self.current_index - self.end_spaces
self.result.append(self.text)
self.text = None
self.end_spaces = 0

def close(self):
super().close()
Expand Down
8 changes: 7 additions & 1 deletion backend/src/app/preprocessing/preprocessing_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
from app.core.data.repo.repo_service import (
FileNotFoundInRepositoryError,
RepoService,
UnsupportedDocTypeForMimeType,
UnsupportedDocTypeForSourceDocument,
)
from app.core.db.sql_service import SQLService
Expand All @@ -40,6 +39,13 @@
from app.util.singleton_meta import SingletonMeta


class UnsupportedDocTypeForMimeType(Exception):
    """Raised when no DocType can be inferred from a file's MIME type."""

    def __init__(self, mime_type: str) -> None:
        message = (
            f"Unsupported DocType! Cannot infer DocType from MimeType '{mime_type}'."
        )
        super().__init__(message)


class PreprocessingService(metaclass=SingletonMeta):
def __new__(cls, *args, **kwargs):
cls.sqls: SQLService = SQLService()
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/api/openapi/models/ExportJobParameters.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ export type ExportJobParameters = {
/**
* The format of the exported data.
*/
export_format?: ExportFormat | null;
export_format?: ExportFormat;
/**
* Specific parameters for the export job w.r.t it's type
*/
Expand Down
Loading
Loading