
Merge pull request #415 from uhh-lt/remove-unused-data-from-dbs
Remove unused data from dbs
bigabig authored Sep 25, 2024
2 parents b5d37f8 + d14ed8a commit 5d3b2ed
Showing 12 changed files with 65 additions and 226 deletions.
40 changes: 40 additions & 0 deletions — new Alembic migration (file path not captured in this view)
@@ -0,0 +1,40 @@
"""remove word frequencies from sdocdata
Revision ID: 53ec37ba68dd
Revises: 45549c9c4ff2
Create Date: 2024-09-17 11:13:19.039284
"""

from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "53ec37ba68dd"
down_revision: Union[str, None] = "45549c9c4ff2"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column("sourcedocumentdata", "word_frequencies")
    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "sourcedocumentdata",
        sa.Column(
            "word_frequencies",
            sa.VARCHAR(),
            server_default=sa.text("'[]'::character varying"),
            autoincrement=False,
            nullable=False,
        ),
    )
    # ### end Alembic commands ###
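Reviewer note: a minimal sketch of applying and reverting this revision programmatically, assuming a standard Alembic setup with an alembic.ini next to the migration environment (the config path is an assumption; the usual CLI equivalent is `alembic upgrade head` / `alembic downgrade 45549c9c4ff2`):

```python
# Minimal sketch, not part of this PR; the "alembic.ini" path is assumed.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")

# Apply this revision: drops sourcedocumentdata.word_frequencies.
command.upgrade(cfg, "53ec37ba68dd")

# Revert to the previous revision: restores the VARCHAR column with default '[]'.
command.downgrade(cfg, "45549c9c4ff2")
```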
28 changes: 3 additions & 25 deletions backend/src/app/core/analysis/duplicate_finder_service.py
@@ -3,15 +3,12 @@

import networkx as nx
import numpy as np
import srsly
from loguru import logger
from scipy import sparse
from sklearn.metrics.pairwise import manhattan_distances

from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.doc_type import DocType
from app.core.data.dto.word_frequency import WordFrequencyRead
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.source_document_data import SourceDocumentDataORM
from app.core.db.sql_service import SQLService
from app.util.singleton_meta import SingletonMeta

@@ -27,31 +24,12 @@ def find_duplicate_text_sdocs(
logger.info("Finding duplicate text sdocs")
t0 = time.time()
with self.sqls.db_session() as db:
result = (
db.query(
SourceDocumentDataORM.id, SourceDocumentDataORM.word_frequencies
)
.join(
SourceDocumentORM, SourceDocumentORM.id == SourceDocumentDataORM.id
)
.filter(
SourceDocumentORM.project_id == project_id,
SourceDocumentORM.doctype == DocType.text,
)
.all()
result = crud_word_frequency.read_by_project_and_doctype(
db, project_id=project_id, doctype=DocType.text
)
t1 = time.time()
logger.info(f"query took: {t1 - t0}")

t0 = time.time()
result = [
WordFrequencyRead(sdoc_id=int(row[0]), **wf)
for row in result
for wf in srsly.json_loads(row[1])
]
t1 = time.time()
logger.info(f"convert took: {t1 - t0}")

t0 = time.time()
# unique words in project
words = set([r.word.lower() for r in result])
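Reviewer note: the remaining imports (scipy.sparse, manhattan_distances, networkx) suggest the word frequencies are turned into a sparse document-term matrix and compared pairwise. The continuation of find_duplicate_text_sdocs is not part of this diff; the sketch below only illustrates that idea, with assumed variable names and an assumed distance threshold.

```python
# Illustrative sketch only — not the code of this PR. "result" is the
# List[WordFrequencyRead] returned by read_by_project_and_doctype above.
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import manhattan_distances

words = sorted({r.word.lower() for r in result})
word_idx = {w: i for i, w in enumerate(words)}
sdoc_ids = sorted({r.sdoc_id for r in result})
sdoc_idx = {s: i for i, s in enumerate(sdoc_ids)}

# Sparse document-term count matrix: one row per SourceDocument, one column per word.
rows = [sdoc_idx[r.sdoc_id] for r in result]
cols = [word_idx[r.word.lower()] for r in result]
counts = [r.count for r in result]
dtm = sparse.csr_matrix((counts, (rows, cols)), shape=(len(sdoc_ids), len(words)))

# Pairwise L1 distances; a distance of 0 means identical word counts.
dists = manhattan_distances(dtm)
candidate_pairs = np.argwhere(np.triu(dists <= 0, k=1))  # threshold 0 is an assumption
```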
22 changes: 20 additions & 2 deletions backend/src/app/core/data/crud/word_frequency.py
@@ -1,5 +1,11 @@
from typing import List

from sqlalchemy.orm import Session

from app.core.data.crud.crud_base import CRUDBase
from app.core.data.dto.word_frequency import WordFrequencyCreate
from app.core.data.doc_type import DocType
from app.core.data.dto.word_frequency import WordFrequencyCreate, WordFrequencyRead
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.word_frequency import WordFrequencyORM


@@ -10,7 +16,19 @@ class CrudWordFrequency(
        None,
    ]
):
    pass
    def read_by_project_and_doctype(
        self, db: Session, *, project_id: int, doctype: DocType
    ) -> List[WordFrequencyRead]:
        wf_orms = (
            db.query(WordFrequencyORM)
            .join(WordFrequencyORM.source_document)
            .filter(
                SourceDocumentORM.project_id == project_id,
                SourceDocumentORM.doctype == doctype,
            )
            .all()
        )
        return [WordFrequencyRead.model_validate(wf) for wf in wf_orms]


crud_word_frequency = CrudWordFrequency(WordFrequencyORM)
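Reviewer note: a brief usage sketch of the new accessor. The session handling via SQLService mirrors duplicate_finder_service.py; the concrete project_id is only an example.

```python
# Usage sketch under the above assumptions — not part of this PR.
from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.doc_type import DocType
from app.core.db.sql_service import SQLService

with SQLService().db_session() as db:
    wfs = crud_word_frequency.read_by_project_and_doctype(
        db, project_id=1, doctype=DocType.text
    )
# Callers now receive typed WordFrequencyRead objects instead of parsing the
# former JSON column themselves.
vocabulary = {wf.word.lower() for wf in wfs}
```

Note that WordFrequencyRead.model_validate(wf_orm) presumes the DTO is configured for attribute-based validation (from_attributes=True in Pydantic v2).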
65 changes: 0 additions & 65 deletions backend/src/app/core/data/dto/search.py
@@ -25,37 +25,9 @@ class MemoQueryBase(BaseModel):
    )


class ElasticSearchIntegerRange(BaseModel):
    gte: int
    lt: int


class ElasticSearchDocumentCreate(BaseModel):
    filename: str = Field(description="The filename of the SourceDocument")
    content: str = Field(description="The raw text of the SourceDocument")
    html: str = Field(description="The html of the SourceDocument")
    tokens: List[str] = Field(
        description="The list of the tokens in the SourceDocument"
    )
    token_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
        description=(
            "The list of character " "offsets for the tokens " "in the SourceDocument"
        )
    )
    sentences: List[str] = Field(
        description="The list of the sentences in the SourceDocument"
    )
    sentence_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
        description=(
            "The list of character "
            "offsets for the "
            "sentences "
            "in the SourceDocument"
        )
    )
    keywords: List[str] = Field(
        description="The list of keywords of the SourceDocument"
    )
    sdoc_id: int = Field(
        description="The ID of the SourceDocument as it is in the SQL DB"
    )
@@ -67,43 +39,6 @@ class ElasticSearchDocumentCreate(BaseModel):
    )


class ElasticSearchDocumentRead(BaseModel):
    filename: Optional[str] = Field(description="The filename of the SourceDocument")
    content: Optional[str] = Field(description="The raw text of the SourceDocument")
    html: Optional[str] = Field(description="The html of the SourceDocument")
    tokens: Optional[List[str]] = Field(
        description="The list of the tokens in the SourceDocument"
    )
    token_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
        description=(
            "The list of character " "offsets for the tokens " "in the SourceDocument"
        )
    )
    sentences: Optional[List[str]] = Field(
        description="The list of the sentences in the SourceDocument"
    )
    sentence_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
        description=(
            "The list of character "
            "offsets for the "
            "sentences "
            "in the SourceDocument"
        )
    )
    keywords: Optional[List[str]] = Field(
        description="The list of keywords of the SourceDocument"
    )
    sdoc_id: Optional[int] = Field(
        description="The ID of the SourceDocument as it is in the SQL DB"
    )
    project_id: Optional[int] = Field(
        description="The ID of the Project the SourceDocument belongs to"
    )
    created: Optional[datetime] = Field(
        description="The created date of the SourceDocument", default=datetime.now()
    )


class ElasticSearchDocumentHit(BaseModel):
    document_id: int = Field(description="The ID of the Document")
    score: Optional[float] = Field(
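Reviewer note: after these deletions, ElasticSearchDocumentCreate is reduced to roughly the fields still passed by the preprocessing step further down in this diff (filename, content, sdoc_id, project_id, plus the created timestamp). A hedged sketch of the resulting shape; the exact remaining definition lies outside the shown hunks.

```python
# Approximate post-PR shape, inferred from the remaining diff lines; the
# "created" default mirrors the Read variant and is an assumption.
from datetime import datetime
from pydantic import BaseModel, Field


class ElasticSearchDocumentCreate(BaseModel):
    filename: str = Field(description="The filename of the SourceDocument")
    content: str = Field(description="The raw text of the SourceDocument")
    sdoc_id: int = Field(
        description="The ID of the SourceDocument as it is in the SQL DB"
    )
    project_id: int = Field(
        description="The ID of the Project the SourceDocument belongs to"
    )
    created: datetime = Field(
        description="The created date of the SourceDocument",
        default_factory=datetime.now,
    )
```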
6 changes: 1 addition & 5 deletions backend/src/app/core/data/dto/source_document_data.py
@@ -37,8 +37,4 @@ class SourceDocumentDataRead(SourceDocumentDataBase):

# Properties for creation
class SourceDocumentDataCreate(SourceDocumentDataBase):
    word_frequencies: str = Field(
        description=(
            "JSON Representation of List[WordFrequency] of the SourceDocument"
        ),
    )
    pass
7 changes: 0 additions & 7 deletions backend/src/app/core/data/orm/source_document_data.py
@@ -29,13 +29,6 @@ class SourceDocumentDataORM(ORMBase):
    sentence_ends: Mapped[List[int]] = mapped_column(
        ARRAY(Integer), nullable=False, index=False
    )
    # JSON representation of List[{word: str, count: int}]
    word_frequencies: Mapped[str] = mapped_column(
        String,
        server_default="[]",
        nullable=False,
        index=False,
    )

    @property
    def tokens(self):
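Reviewer note: with the JSON column gone, per-document word counts live only in the dedicated word_frequency table. The ORM sketched below is an assumption inferred from the join in crud_word_frequency and the fields of WordFrequencyRead; the real WordFrequencyORM is not shown in this PR.

```python
# Hypothetical shape of WordFrequencyORM — NOT from this PR; column names are
# inferred from WordFrequencyRead (sdoc_id, word, count), while the base class,
# module path, and FK target are assumptions.
from sqlalchemy import ForeignKey, Integer, String
from sqlalchemy.orm import Mapped, mapped_column, relationship

from app.core.data.orm.orm_base import ORMBase  # assumed module path


class WordFrequencyORM(ORMBase):
    sdoc_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("sourcedocument.id", ondelete="CASCADE"), primary_key=True
    )
    word: Mapped[str] = mapped_column(String, primary_key=True)
    count: Mapped[int] = mapped_column(Integer, nullable=False)

    # Enables the join used by crud_word_frequency.read_by_project_and_doctype.
    source_document: Mapped["SourceDocumentORM"] = relationship("SourceDocumentORM")
```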
82 changes: 0 additions & 82 deletions backend/src/app/core/search/elasticsearch_service.py
@@ -8,7 +8,6 @@
from app.core.data.dto.search import (
    ElasticSearchDocumentCreate,
    ElasticSearchDocumentHit,
    ElasticSearchDocumentRead,
    ElasticSearchMemoCreate,
    ElasticSearchMemoRead,
    ElasticSearchMemoUpdate,
@@ -18,16 +17,6 @@
from config import conf


class NoSuchSourceDocumentInElasticSearchError(Exception):
    def __init__(self, proj_id: int, sdoc_id: int):
        super().__init__(
            (
                f"There exists no SourceDocument with ID={sdoc_id} in Project {proj_id}"
                " in the respective ElasticSearch Index!"
            )
        )


class NoSuchMemoInElasticSearchError(Exception):
    def __init__(self, proj_id: int, memo_id: int):
        super().__init__(
@@ -203,77 +192,6 @@ def add_document_to_index(
        )
        return res["_id"]

    def bulk_add_documents_to_index(
        self, *, proj_id: int, esdocs: List[ElasticSearchDocumentCreate]
    ) -> int:
        idx_name = self.__get_index_name(proj_id=proj_id, index_type="doc")

        def generate_actions_for_bulk_index():
            for esdoc in esdocs:
                doc = {
                    "_index": idx_name,
                    "_id": esdoc.sdoc_id,
                    "_source": esdoc.model_dump(),
                }
                yield doc

        num_indexed, num_errors = helpers.bulk(
            client=self.__client,
            actions=generate_actions_for_bulk_index(),
            stats_only=True,
        )
        logger.debug(
            f"Added {num_indexed} Documents to ElasticSearch Index {idx_name} with {num_errors} Errors!"
        )
        return num_indexed

    def get_esdocs_by_sdoc_ids(
        self, *, proj_id: int, sdoc_ids: Set[int], fields: Set[str]
    ) -> List[ElasticSearchDocumentRead]:
        if not fields.union(self.doc_index_fields):
            raise NoSuchFieldInIndexError(
                index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
                fields=fields,
                index_fields=self.doc_index_fields,
            )
        results = self.__client.mget(
            index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
            _source=list(fields),
            body={"ids": list(sdoc_ids)},
        )

        esdocs = []
        for res in results["docs"]:
            if not res["found"]:
                raise NoSuchSourceDocumentInElasticSearchError(
                    proj_id=proj_id, sdoc_id=res["_id"]
                )
            esdocs.append(
                ElasticSearchDocumentRead(sdoc_id=res["_id"], **res["_source"])
            )

        return esdocs

    def get_esdoc_by_sdoc_id(
        self, *, proj_id: int, sdoc_id: int, fields: Set[str]
    ) -> Optional[ElasticSearchDocumentRead]:
        if not fields.union(self.doc_index_fields):
            raise NoSuchFieldInIndexError(
                index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
                fields=fields,
                index_fields=self.doc_index_fields,
            )
        res = self.__client.get(
            index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
            id=str(sdoc_id),
            _source=list(fields),
        )
        if not res["found"]:
            raise NoSuchSourceDocumentInElasticSearchError(
                proj_id=proj_id, sdoc_id=sdoc_id
            )
        return ElasticSearchDocumentRead(**res["_source"])

    def delete_document_from_index(self, proj_id: int, sdoc_id: int) -> None:
        self.__client.delete(
            index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
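Reviewer note: with get_esdoc_by_sdoc_id / get_esdocs_by_sdoc_ids removed, callers that need raw document content would read it from the SQL side (SourceDocumentDataORM) rather than Elasticsearch. A hedged sketch; the crud_sdoc_data module path and its read signature are assumptions based on the call in the preprocessing step below.

```python
# Hedged sketch, not part of this PR; assumes CRUDBase exposes read(db, id=...).
from app.core.data.crud.source_document_data import crud_sdoc_data  # assumed path
from app.core.db.sql_service import SQLService

sdoc_id = 42  # example ID

with SQLService().db_session() as db:
    sdoc_data = crud_sdoc_data.read(db=db, id=sdoc_id)
    text = sdoc_data.content  # previously fetched via ElasticSearch mget/get
```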
(File header not captured: the preprocessing pipeline step that stores SourceDocuments in Elasticsearch.)
@@ -1,6 +1,5 @@
from app.core.data.dto.search import (
    ElasticSearchDocumentCreate,
    ElasticSearchIntegerRange,
)
from app.core.search.elasticsearch_service import ElasticSearchService
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
@@ -18,17 +17,6 @@ def store_document_in_elasticsearch(cargo: PipelineCargo) -> PipelineCargo:
    esdoc = ElasticSearchDocumentCreate(
        filename=pptd.filename,
        content=pptd.text,
        html=pptd.html,
        tokens=pptd.tokens,
        token_character_offsets=[
            ElasticSearchIntegerRange(gte=o[0], lt=o[1])
            for o in pptd.token_character_offsets
        ],
        sentences=[s.text for s in pptd.sentences],
        sentence_character_offsets=[
            ElasticSearchIntegerRange(gte=s.start, lt=s.end) for s in pptd.sentences
        ],
        keywords=pptd.keywords,
        sdoc_id=sdoc_id,
        project_id=pptd.project_id,
    )
(File header not captured: the preprocessing step that persists SourceDocument data to the SQL database.)
@@ -1,6 +1,5 @@
import traceback

import srsly
from loguru import logger
from psycopg2 import OperationalError
from sqlalchemy.orm import Session
@@ -53,10 +52,6 @@ def _create_and_persist_sdoc(db: Session, pptd: PreProTextDoc) -> SourceDocument
def _persist_sdoc_data(
    db: Session, sdoc_db_obj: SourceDocumentORM, pptd: PreProTextDoc
) -> None:
    word_frequencies_str = srsly.json_dumps(
        [{"word": word, "count": count} for word, count in pptd.word_freqs.items()]
    )

    sdoc_data = SourceDocumentDataCreate(
        id=sdoc_db_obj.id,
        content=pptd.text,
@@ -65,7 +60,6 @@ def _persist_sdoc_data(
        token_ends=[e for _, e in pptd.token_character_offsets],
        sentence_starts=[s.start for s in pptd.sentences],
        sentence_ends=[s.end for s in pptd.sentences],
        word_frequencies=word_frequencies_str,
    )
    crud_sdoc_data.create(db=db, create_dto=sdoc_data)

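Reviewer note: since word_frequencies is no longer part of SourceDocumentDataCreate, the counts in pptd.word_freqs presumably get written through the WordFrequency CRUD in a separate pipeline step that is not shown here. A hedged sketch of what that write could look like; the bulk create helper and the WordFrequencyCreate field names are assumptions.

```python
# Hedged sketch — not code from this PR.
from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.dto.word_frequency import WordFrequencyCreate

wf_create_dtos = [
    WordFrequencyCreate(sdoc_id=sdoc_db_obj.id, word=word, count=count)
    for word, count in pptd.word_freqs.items()
]
# create_multi is an assumed bulk helper on the CRUD base class.
crud_word_frequency.create_multi(db=db, create_dtos=wf_create_dtos)
```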
(Remaining changed files were not loaded in this view.)
