LLM Integration #412

Merged
merged 50 commits on Sep 25, 2024
b5c2641
add ollama service
bigabig Aug 26, 2024
7ca5447
new ollama service for communication with ollama
bigabig Aug 26, 2024
71ed8ae
introduced llm service for performing llm jobs
bigabig Aug 26, 2024
b887ce8
added missing ollama env variables
bigabig Aug 26, 2024
ebc7c3b
new endpoints for managing llm jobs
bigabig Aug 26, 2024
99fe7d9
added missing ollama dependency
bigabig Aug 26, 2024
44d501d
added endpoint to set document tags
bigabig Aug 26, 2024
2c71d48
update api
bigabig Aug 26, 2024
088ec6b
new feature: llm assistant - first working version
bigabig Aug 26, 2024
f2c1c00
add ollama profile
bigabig Sep 12, 2024
d9d4414
write comments in English
bigabig Sep 12, 2024
52cdbf3
add new services to startup.py
bigabig Sep 12, 2024
bbfaa67
added updated / created to llmjob
bigabig Sep 12, 2024
97346a5
update api
bigabig Sep 12, 2024
4fae11c
add LLMJobs to Background Tasks
bigabig Sep 12, 2024
5d2d106
add description to project metadata
bigabig Sep 16, 2024
42f7076
add corresponding alembic migration
bigabig Sep 16, 2024
bd4efdd
update api
bigabig Sep 16, 2024
b2b4548
added field to update project metadata description
bigabig Sep 16, 2024
508b466
reworked llm assistant workflow so that user can change prompt in fro…
bigabig Sep 17, 2024
570c953
updated api
bigabig Sep 17, 2024
09fd9e8
renamed NewTags -> FinalTags
bigabig Sep 17, 2024
1f78773
updated prompt editor
bigabig Sep 17, 2024
e1a629a
updated llmjob background task view
bigabig Sep 17, 2024
a851072
added metadata extraction with llms
bigabig Sep 18, 2024
0fe2c99
updated api
bigabig Sep 18, 2024
f9dcd2c
first working version of llm assistant with metadata extraction
bigabig Sep 18, 2024
1490514
added bulk update sdoc metadata endpoint
bigabig Sep 19, 2024
809e725
updated api
bigabig Sep 19, 2024
0e6876e
bulk update metadata as last step of llm assistant
bigabig Sep 19, 2024
f5d30ee
added automatic annotation to llm assistant
bigabig Sep 19, 2024
f8655ed
updated api
bigabig Sep 19, 2024
5c9ef14
first working version of automatic annotation in llm assistant
bigabig Sep 19, 2024
07a6829
fixed wrong ids
bigabig Sep 19, 2024
a2a6dc8
first progress to annotation suggestion visualization
bigabig Sep 19, 2024
a7d7975
added new endpoint to bulk create span annotations
bigabig Sep 24, 2024
18536cb
updated api & added new hook
bigabig Sep 24, 2024
5880084
finished llm annotation feature by adding the text annotation validator
bigabig Sep 24, 2024
1686446
fixed some texts
bigabig Sep 24, 2024
d013a6c
fixed bug, cannot reopen llm dialog from background jobs menu
bigabig Sep 24, 2024
578b553
improve minor design decisions
bigabig Sep 24, 2024
4f29d05
fixed bug in prompt generation
bigabig Sep 24, 2024
f79dd40
removed deprecated links from docker compose
bigabig Sep 25, 2024
efb543c
moved services in docker compose
bigabig Sep 25, 2024
6b0baad
updated monkey patch to include ollama
bigabig Sep 25, 2024
1adc56e
updated monkey patch to include ollama
bigabig Sep 25, 2024
4e6842a
added flag ollama_enabled
bigabig Sep 25, 2024
09aafc7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 25, 2024
dd2620c
added ollama enabled to backend unit tests
bigabig Sep 25, 2024
819dfe0
capitalize AS
bigabig Sep 25, 2024
3 changes: 2 additions & 1 deletion .github/workflows/backend_check_schema.yml
@@ -16,6 +16,7 @@ jobs:
env:
API_PRODUCTION_WORKERS: 1
RAY_ENABLED: False
OLLAMA_ENABLED: False
COMPOSE_PROFILES: "background"
steps:
- uses: actions/checkout@v3
@@ -25,7 +26,7 @@ jobs:
./setup-folders.sh
cp .env.example .env
chmod -R a+rwx backend_repo/ models_cache/ spacy_models/
python monkey_patch_docker_compose_for_backend_tests.py --disable_ray
python monkey_patch_docker_compose_for_backend_tests.py --disable_ray --disable_ollama
export GID=$(id -g)
docker compose -f compose-test.yml up -d --quiet-pull postgres
echo Waiting for containers to start...
3 changes: 2 additions & 1 deletion .github/workflows/backend_unit_tests.yml
@@ -52,6 +52,7 @@ jobs:
# disable backend and frontend
COMPOSE_PROFILES: "background"
RAY_ENABLED: False
OLLAMA_ENABLED: False
POSTGRES_DB: dats-test
JWT_SECRET: ${{ secrets.JWT_SECRET }}
steps:
@@ -66,7 +67,7 @@ jobs:
./setup-folders.sh
cp .env.example .env
chmod -R a+rwx backend_repo/ models_cache/ spacy_models/
python monkey_patch_docker_compose_for_backend_tests.py --disable_ray
python monkey_patch_docker_compose_for_backend_tests.py --disable_ray --disable_ollama
export GID=$(id -g)
docker compose -f compose-test.yml up -d --quiet-pull
echo Waiting for containers to start...
3 changes: 2 additions & 1 deletion .github/workflows/update-openapi-spec.yml
@@ -18,6 +18,7 @@ jobs:
env:
API_PRODUCTION_WORKERS: 1
RAY_ENABLED: False
OLLAMA_ENABLED: False
API_EXPOSED: 5500
VITE_APP_SERVER: http://localhost:5500
steps:
@@ -33,7 +34,7 @@ jobs:
./setup-folders.sh
cp .env.example .env
chmod -R a+rwx backend_repo/ models_cache/ spacy_models/
python monkey_patch_docker_compose_for_backend_tests.py --disable_ray
python monkey_patch_docker_compose_for_backend_tests.py --disable_ray --disable_ollama
export GID=$(id -g)
docker compose -f compose-test.yml up -d --quiet-pull --wait --wait-timeout 300
echo Waiting for containers to start...
5 changes: 5 additions & 0 deletions backend/.env.example
@@ -70,6 +70,11 @@ REDIS_PASSWORD=dats123
WEAVIATE_HOST=localhost
WEAVIATE_PORT=13241

OLLAMA_ENABLED=True
OLLAMA_HOST=localhost
OLLAMA_PORT=13242
OLLAMA_MODEL=gemma2:latest

# Mail sending configuration
MAIL_ENABLED=False
[email protected]
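The new `OLLAMA_*` settings in `.env.example` configure where the backend reaches its Ollama instance. A minimal sketch of how they could be consumed, assuming the variable names above and the pinned `ollama==0.3.1` client (whose `Client` accepts a host URL like the one built here); the actual request is kept commented so the sketch stays self-contained:

```python
import os

def ollama_base_url() -> str:
    # Defaults mirror .env.example; real deployments override via the environment
    host = os.environ.get("OLLAMA_HOST", "localhost")
    port = os.environ.get("OLLAMA_PORT", "13242")
    return f"http://{host}:{port}"

model = os.environ.get("OLLAMA_MODEL", "gemma2:latest")

# from ollama import Client
# client = Client(host=ollama_base_url())
# reply = client.chat(model=model, messages=[{"role": "user", "content": "Hi"}])
```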
1 change: 1 addition & 0 deletions backend/.env.testing.example
@@ -5,6 +5,7 @@
# This way, you can keep only necessary overrides in .env.testing

RAY_ENABLED=False
OLLAMA_ENABLED=False
POSTGRES_DB=dats-testing
# These are separate variables from `WEAVIATE_PORT` etc.
# because we need to spin up separate containers for testing
2 changes: 1 addition & 1 deletion backend/Dockerfile
@@ -1,7 +1,7 @@
# docker build -f Dockerfile -t uhhlt/dats_backend:<version> .
# docker push uhhlt/dats_backend:<version>

FROM ubuntu:jammy-20221020 as ubuntu
FROM ubuntu:jammy-20221020 AS ubuntu
CMD ["/bin/bash"]

# makes CUDA devices visible to the container by default
1 change: 1 addition & 0 deletions backend/requirements.txt
@@ -1,4 +1,5 @@
mammoth==1.6.0
ollama==0.3.1
pymupdf==1.23.4
pytest-order==1.2.1
Scrapy==2.10.0
@@ -0,0 +1,44 @@
"""add project metadata description

Revision ID: 45549c9c4ff2
Revises: 2b91203d1bb6
Create Date: 2024-09-16 08:41:55.296647

"""

from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "45549c9c4ff2"
down_revision: Union[str, None] = "2b91203d1bb6"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# add new column
op.add_column(
"projectmetadata", sa.Column("description", sa.String(), nullable=True)
)

# edit all existing rows to have a description
op.execute(
"""
UPDATE projectmetadata
SET description = 'Placeholder description'
WHERE description IS NULL
"""
)

# make the column not nullable
op.alter_column("projectmetadata", "description", nullable=False)


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column("projectmetadata", "description")
# ### end Alembic commands ###
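The migration follows the standard add-backfill-constrain pattern: add the column as nullable, fill existing rows, then tighten to `NOT NULL`. The backfill step can be reproduced in isolation against an in-memory SQLite table (the real migration targets the project's database via Alembic):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE projectmetadata (id INTEGER PRIMARY KEY, description TEXT)"
)
conn.executemany(
    "INSERT INTO projectmetadata (description) VALUES (?)",
    [(None,), ("already set",)],
)

# The same backfill UPDATE the migration runs before adding NOT NULL
conn.execute(
    "UPDATE projectmetadata SET description = 'Placeholder description' "
    "WHERE description IS NULL"
)

rows = [r[0] for r in conn.execute("SELECT description FROM projectmetadata ORDER BY id")]
# rows → ['Placeholder description', 'already set']
```

Only rows that were `NULL` are touched, so existing descriptions survive the constraint change.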
31 changes: 31 additions & 0 deletions backend/src/api/endpoints/document_tag.py
@@ -14,6 +14,7 @@
DocumentTagCreate,
DocumentTagRead,
DocumentTagUpdate,
SourceDocumentDocumentTagLinks,
SourceDocumentDocumentTagMultiLink,
)
from app.core.data.dto.memo import AttachedObjectType, MemoCreate, MemoInDB, MemoRead
@@ -114,6 +115,36 @@ def unlink_multiple_tags(
)


@router.patch(
"/bulk/set",
response_model=int,
summary="Sets SourceDocuments' tags to the provided tags",
)
def set_document_tags_batch(
*,
db: Session = Depends(get_db_session),
links: List[SourceDocumentDocumentTagLinks],
authz_user: AuthzUser = Depends(),
validate: Validate = Depends(),
) -> int:
sdoc_ids = [link.source_document_id for link in links]
tag_ids = list(set([tag_id for link in links for tag_id in link.document_tag_ids]))
# TODO this is a little inefficient, but at the moment
# the frontend is never sending more than one id at a time
authz_user.assert_in_same_project_as_many(Crud.SOURCE_DOCUMENT, sdoc_ids)
authz_user.assert_in_same_project_as_many(Crud.DOCUMENT_TAG, tag_ids)

validate.validate_objects_in_same_project(
[(Crud.SOURCE_DOCUMENT, sdoc_id) for sdoc_id in sdoc_ids]
+ [(Crud.DOCUMENT_TAG, tag_id) for tag_id in tag_ids]
)

return crud_document_tag.set_document_tags_batch(
db=db,
links={link.source_document_id: link.document_tag_ids for link in links},
)


@router.get(
"/{tag_id}",
response_model=DocumentTagRead,
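The bulk endpoint flattens the incoming links twice: once into deduplicated id lists for the permission checks, and once into a per-document mapping for the CRUD layer. A small sketch with a hypothetical stand-in for `SourceDocumentDocumentTagLinks`:

```python
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class Link:  # hypothetical stand-in for SourceDocumentDocumentTagLinks
    source_document_id: int
    document_tag_ids: List[int]

links = [Link(1, [10, 11]), Link(2, [11, 12])]

sdoc_ids = [link.source_document_id for link in links]
# Tag ids are deduplicated across all links before the project checks
tag_ids = sorted({tag_id for link in links for tag_id in link.document_tag_ids})
# The shape handed to crud_document_tag.set_document_tags_batch
links_map: Dict[int, List[int]] = {
    link.source_document_id: link.document_tag_ids for link in links
}
```

Note that tag 11 appears in both links but only once in `tag_ids`, so each tag is authorized exactly once.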
68 changes: 68 additions & 0 deletions backend/src/api/endpoints/llm.py
@@ -0,0 +1,68 @@
from typing import List

from fastapi import APIRouter, Depends

from api.dependencies import get_current_user
from app.celery.background_jobs import prepare_and_start_llm_job_async
from app.core.authorization.authz_user import AuthzUser
from app.core.data.dto.llm_job import LLMJobParameters, LLMJobRead, LLMPromptTemplates
from app.core.data.llm.llm_service import LLMService

router = APIRouter(
prefix="/llm", dependencies=[Depends(get_current_user)], tags=["llm"]
)

llms: LLMService = LLMService()


@router.post(
"",
response_model=LLMJobRead,
summary="Returns the LLMJob for the given Parameters",
)
def start_llm_job(
*, llm_job_params: LLMJobParameters, authz_user: AuthzUser = Depends()
) -> LLMJobRead:
authz_user.assert_in_project(llm_job_params.project_id)

return prepare_and_start_llm_job_async(llm_job_params=llm_job_params)


@router.get(
"/{llm_job_id}",
response_model=LLMJobRead,
summary="Returns the LLMJob for the given ID if it exists",
)
def get_llm_job(*, llm_job_id: str, authz_user: AuthzUser = Depends()) -> LLMJobRead:
job = llms.get_llm_job(llm_job_id=llm_job_id)
authz_user.assert_in_project(job.parameters.project_id)

return job


@router.get(
"/project/{project_id}",
response_model=List[LLMJobRead],
summary="Returns all LLMJobs for the given project ID",
)
def get_all_llm_jobs(
*, project_id: int, authz_user: AuthzUser = Depends()
) -> List[LLMJobRead]:
authz_user.assert_in_project(project_id)

llm_jobs = llms.get_all_llm_jobs(project_id=project_id)
llm_jobs.sort(key=lambda x: x.created, reverse=True)
return llm_jobs


@router.post(
"/create_prompt_templates",
response_model=List[LLMPromptTemplates],
summary="Returns the system and user prompt templates for the given llm task in all supported languages",
)
def create_prompt_templates(
*, llm_job_params: LLMJobParameters, authz_user: AuthzUser = Depends()
) -> List[LLMPromptTemplates]:
authz_user.assert_in_project(llm_job_params.project_id)

return llms.create_prompt_templates(llm_job_params=llm_job_params)
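From a client's perspective, the new router supports a start-then-poll flow: `POST /llm` returns an `LLMJobRead` immediately, and `GET /llm/{llm_job_id}` reports progress. An illustrative stdlib sketch; the base URL and the payload fields are assumptions (the real schema is `LLMJobParameters`), and the network calls are kept commented:

```python
import json
from urllib import request

BASE = "http://localhost:5500"  # assumption: API exposed as in the CI workflow files

def build_start_request(params: dict) -> request.Request:
    # POST /llm starts an LLMJob and returns its LLMJobRead DTO
    return request.Request(
        f"{BASE}/llm",
        data=json.dumps(params).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

# Hypothetical parameters; consult LLMJobParameters for the real fields
req = build_start_request({"project_id": 1})
# job = json.loads(request.urlopen(req).read())                      # start the job
# status = json.loads(request.urlopen(f"{BASE}/llm/{job['id']}").read())  # poll it
```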
24 changes: 24 additions & 0 deletions backend/src/api/endpoints/source_document_metadata.py
@@ -1,3 +1,5 @@
from typing import List

from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

@@ -7,6 +9,7 @@
from app.core.data.crud import Crud
from app.core.data.crud.source_document_metadata import crud_sdoc_meta
from app.core.data.dto.source_document_metadata import (
SourceDocumentMetadataBulkUpdate,
SourceDocumentMetadataCreate,
SourceDocumentMetadataRead,
SourceDocumentMetadataReadResolved,
@@ -82,6 +85,27 @@ def update_by_id(
return SourceDocumentMetadataRead.model_validate(db_obj)


@router.patch(
"/bulk/update",
response_model=List[SourceDocumentMetadataRead],
summary="Updates multiple metadata objects at once.",
)
def update_bulk(
*,
db: Session = Depends(get_db_session),
metadatas: List[SourceDocumentMetadataBulkUpdate],
authz_user: AuthzUser = Depends(),
) -> List[SourceDocumentMetadataRead]:
authz_user.assert_in_same_project_as_many(
Crud.SOURCE_DOCUMENT_METADATA, [m.id for m in metadatas]
)

db_objs = crud_sdoc_meta.update_bulk(db=db, update_dtos=metadatas)
return [SourceDocumentMetadataRead.model_validate(db_obj) for db_obj in db_objs]


@router.delete(
"/{metadata_id}",
response_model=SourceDocumentMetadataRead,
41 changes: 41 additions & 0 deletions backend/src/api/endpoints/span_annotation.py
@@ -13,6 +13,7 @@
from app.core.data.dto.code import CodeRead
from app.core.data.dto.memo import AttachedObjectType, MemoCreate, MemoInDB, MemoRead
from app.core.data.dto.span_annotation import (
SpanAnnotationCreateBulkWithCodeId,
SpanAnnotationCreateWithCodeId,
SpanAnnotationRead,
SpanAnnotationReadResolved,
@@ -66,6 +67,46 @@ def add_span_annotation(
return span_dto


@router.put(
"/bulk/create",
response_model=Union[List[SpanAnnotationRead], List[SpanAnnotationReadResolved]],
summary="Creates SpanAnnotations in bulk",
)
def add_span_annotations_bulk(
*,
db: Session = Depends(get_db_session),
spans: List[SpanAnnotationCreateBulkWithCodeId],
resolve_code: bool = Depends(resolve_code_param),
authz_user: AuthzUser = Depends(),
validate: Validate = Depends(),
) -> Union[List[SpanAnnotationRead], List[SpanAnnotationReadResolved]]:
for span in spans:
authz_user.assert_in_same_project_as(Crud.CODE, span.code_id)
authz_user.assert_in_same_project_as(Crud.SOURCE_DOCUMENT, span.sdoc_id)
validate.validate_objects_in_same_project(
[
(Crud.CODE, span.code_id),
(Crud.SOURCE_DOCUMENT, span.sdoc_id),
]
)

db_objs = crud_span_anno.create_bulk(db=db, create_dtos=spans)
span_dtos = [SpanAnnotationRead.model_validate(db_obj) for db_obj in db_objs]
if resolve_code:
return [
SpanAnnotationReadResolved(
**span_dto.model_dump(exclude={"current_code_id", "span_text_id"}),
code=CodeRead.model_validate(db_obj.current_code.code),
span_text=db_obj.span_text.text,
user_id=db_obj.annotation_document.user_id,
sdoc_id=db_obj.annotation_document.source_document_id,
)
for span_dto, db_obj in zip(span_dtos, db_objs)
]
else:
return span_dtos


@router.get(
"/{span_id}",
response_model=Union[SpanAnnotationRead, SpanAnnotationReadResolved],
13 changes: 13 additions & 0 deletions backend/src/app/celery/background_jobs/__init__.py
@@ -4,7 +4,9 @@
from app.core.data.crawler.crawler_service import CrawlerService
from app.core.data.dto.crawler_job import CrawlerJobParameters, CrawlerJobRead
from app.core.data.dto.export_job import ExportJobParameters, ExportJobRead
from app.core.data.dto.llm_job import LLMJobParameters, LLMJobRead
from app.core.data.export.export_service import ExportService
from app.core.data.llm.llm_service import LLMService
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo


@@ -76,6 +78,17 @@ def prepare_and_start_crawling_job_async(
return cj


def prepare_and_start_llm_job_async(
llm_job_params: LLMJobParameters,
) -> LLMJobRead:
from app.celery.background_jobs.tasks import start_llm_job

llms: LLMService = LLMService()
llm_job = llms.prepare_llm_job(llm_job_params)
start_llm_job.apply_async(kwargs={"llm_job": llm_job})
return llm_job


def execute_text_preprocessing_pipeline_apply_async(
cargos: List[PipelineCargo],
) -> None:
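`prepare_and_start_llm_job_async` uses a prepare-then-enqueue pattern: the job record is created synchronously so the caller gets an id to poll right away, while the heavy work is deferred to Celery. A minimal sketch of that pattern with the Celery call stubbed by a plain function and a hypothetical `FakeJob` standing in for `LLMJobRead`:

```python
from dataclasses import dataclass
from typing import List

@dataclass
class FakeJob:  # stand-in for LLMJobRead
    id: str
    status: str = "Waiting"

queued: List[str] = []

def fake_apply_async(job: FakeJob) -> None:
    # stands in for start_llm_job.apply_async(kwargs={"llm_job": job})
    queued.append(job.id)

def prepare_and_start(job_id: str) -> FakeJob:
    job = FakeJob(id=job_id)  # prepare_llm_job: create and persist the job record
    fake_apply_async(job)     # hand the heavy work to the worker queue
    return job                # caller can poll GET /llm/{llm_job_id}

job = prepare_and_start("llm-job-1")
```

The return value reaches the HTTP client before any LLM work has run; the worker updates the job's status as it progresses.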
18 changes: 18 additions & 0 deletions backend/src/app/celery/background_jobs/llm.py
@@ -0,0 +1,18 @@
from loguru import logger

from app.core.data.dto.llm_job import LLMJobRead
from app.core.data.llm.llm_service import LLMService

llms: LLMService = LLMService()


def start_llm_job_(llm_job: LLMJobRead) -> None:
logger.info(
f"Starting LLMJob {llm_job.id}"
f" with parameters:\n\t{llm_job.parameters.model_dump_json(indent=2)}"
)
llms.start_llm_job_sync(llm_job_id=llm_job.id)

logger.info(f"LLMJob {llm_job.id} has finished!")