diff --git a/backend/src/alembic/versions/53ec37ba68dd_remove_word_frequencies_from_sdocdata.py b/backend/src/alembic/versions/53ec37ba68dd_remove_word_frequencies_from_sdocdata.py
new file mode 100644
index 000000000..f0014bded
--- /dev/null
+++ b/backend/src/alembic/versions/53ec37ba68dd_remove_word_frequencies_from_sdocdata.py
@@ -0,0 +1,40 @@
+"""remove word frequencies from sdocdata
+
+Revision ID: 53ec37ba68dd
+Revises: 45549c9c4ff2
+Create Date: 2024-09-17 11:13:19.039284
+
+"""
+
+from typing import Sequence, Union
+
+import sqlalchemy as sa
+
+from alembic import op
+
+# revision identifiers, used by Alembic.
+revision: str = "53ec37ba68dd"
+down_revision: Union[str, None] = "45549c9c4ff2"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column("sourcedocumentdata", "word_frequencies")
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "sourcedocumentdata",
+        sa.Column(
+            "word_frequencies",
+            sa.VARCHAR(),
+            server_default=sa.text("'[]'::character varying"),
+            autoincrement=False,
+            nullable=False,
+        ),
+    )
+    # ### end Alembic commands ###
diff --git a/backend/src/app/core/analysis/duplicate_finder_service.py b/backend/src/app/core/analysis/duplicate_finder_service.py
index 235a10a7e..ab24b239b 100644
--- a/backend/src/app/core/analysis/duplicate_finder_service.py
+++ b/backend/src/app/core/analysis/duplicate_finder_service.py
@@ -3,15 +3,12 @@
 
 import networkx as nx
 import numpy as np
-import srsly
 from loguru import logger
 from scipy import sparse
 from sklearn.metrics.pairwise import manhattan_distances
 
+from app.core.data.crud.word_frequency import crud_word_frequency
 from app.core.data.doc_type import DocType
-from app.core.data.dto.word_frequency import WordFrequencyRead
-from app.core.data.orm.source_document import SourceDocumentORM
-from app.core.data.orm.source_document_data import SourceDocumentDataORM
 from app.core.db.sql_service import SQLService
 from app.util.singleton_meta import SingletonMeta
 
@@ -27,31 +24,12 @@ def find_duplicate_text_sdocs(
         logger.info("Finding duplicate text sdocs")
         t0 = time.time()
         with self.sqls.db_session() as db:
-            result = (
-                db.query(
-                    SourceDocumentDataORM.id, SourceDocumentDataORM.word_frequencies
-                )
-                .join(
-                    SourceDocumentORM, SourceDocumentORM.id == SourceDocumentDataORM.id
-                )
-                .filter(
-                    SourceDocumentORM.project_id == project_id,
-                    SourceDocumentORM.doctype == DocType.text,
-                )
-                .all()
+            result = crud_word_frequency.read_by_project_and_doctype(
+                db, project_id=project_id, doctype=DocType.text
             )
         t1 = time.time()
         logger.info(f"query took: {t1 - t0}")
 
-        t0 = time.time()
-        result = [
-            WordFrequencyRead(sdoc_id=int(row[0]), **wf)
-            for row in result
-            for wf in srsly.json_loads(row[1])
-        ]
-        t1 = time.time()
-        logger.info(f"convert took: {t1 - t0}")
-        t0 = time.time()
 
         # unique words in project
         words = set([r.word.lower() for r in result])
diff --git a/backend/src/app/core/data/crud/word_frequency.py b/backend/src/app/core/data/crud/word_frequency.py
index 3988c84aa..18e4a0345 100644
--- a/backend/src/app/core/data/crud/word_frequency.py
+++ b/backend/src/app/core/data/crud/word_frequency.py
@@ -1,5 +1,11 @@
+from typing import List
+
+from sqlalchemy.orm import Session
+
 from app.core.data.crud.crud_base import CRUDBase
-from app.core.data.dto.word_frequency import WordFrequencyCreate
+from app.core.data.doc_type import DocType
+from app.core.data.dto.word_frequency import WordFrequencyCreate, WordFrequencyRead
+from app.core.data.orm.source_document import SourceDocumentORM
 from app.core.data.orm.word_frequency import WordFrequencyORM
 
 
@@ -10,7 +16,19 @@ class CrudWordFrequency(
         None,
     ]
 ):
-    pass
+    def read_by_project_and_doctype(
+        self, db: Session, *, project_id: int, doctype: DocType
+    ) -> List[WordFrequencyRead]:
+        wf_orms = (
+            db.query(WordFrequencyORM)
+            .join(WordFrequencyORM.source_document)
+            .filter(
+                SourceDocumentORM.project_id == project_id,
+                SourceDocumentORM.doctype == doctype,
+            )
+            .all()
+        )
+        return [WordFrequencyRead.model_validate(wf) for wf in wf_orms]
 
 
 crud_word_frequency = CrudWordFrequency(WordFrequencyORM)
diff --git a/backend/src/app/core/data/dto/search.py b/backend/src/app/core/data/dto/search.py
index cd29152ac..0f6f944fd 100644
--- a/backend/src/app/core/data/dto/search.py
+++ b/backend/src/app/core/data/dto/search.py
@@ -25,37 +25,9 @@ class MemoQueryBase(BaseModel):
     )
 
 
-class ElasticSearchIntegerRange(BaseModel):
-    gte: int
-    lt: int
-
-
 class ElasticSearchDocumentCreate(BaseModel):
     filename: str = Field(description="The filename of the SourceDocument")
     content: str = Field(description="The raw text of the SourceDocument")
-    html: str = Field(description="The html of the SourceDocument")
-    tokens: List[str] = Field(
-        description="The list of the tokens in the SourceDocument"
-    )
-    token_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
-        description=(
-            "The list of character " "offsets for the tokens " "in the SourceDocument"
-        )
-    )
-    sentences: List[str] = Field(
-        description="The list of the sentences in the SourceDocument"
-    )
-    sentence_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
-        description=(
-            "The list of character "
-            "offsets for the "
-            "sentences "
-            "in the SourceDocument"
-        )
-    )
-    keywords: List[str] = Field(
-        description="The list of keywords of the SourceDocument"
-    )
     sdoc_id: int = Field(
         description="The ID of the SourceDocument as it is in the SQL DB"
     )
@@ -67,43 +39,6 @@ class ElasticSearchDocumentCreate(BaseModel):
     )
 
 
-class ElasticSearchDocumentRead(BaseModel):
-    filename: Optional[str] = Field(description="The filename of the SourceDocument")
-    content: Optional[str] = Field(description="The raw text of the SourceDocument")
-    html: Optional[str] = Field(description="The html of the SourceDocument")
-    tokens: Optional[List[str]] = Field(
-        description="The list of the tokens in the SourceDocument"
-    )
-    token_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
-        description=(
-            "The list of character " "offsets for the tokens " "in the SourceDocument"
-        )
-    )
-    sentences: Optional[List[str]] = Field(
-        description="The list of the sentences in the SourceDocument"
-    )
-    sentence_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
-        description=(
-            "The list of character "
-            "offsets for the "
-            "sentences "
-            "in the SourceDocument"
-        )
-    )
-    keywords: Optional[List[str]] = Field(
-        description="The list of keywords of the SourceDocument"
-    )
-    sdoc_id: Optional[int] = Field(
-        description="The ID of the SourceDocument as it is in the SQL DB"
-    )
-    project_id: Optional[int] = Field(
-        description="The ID of the Project the SourceDocument belongs to"
-    )
-    created: Optional[datetime] = Field(
-        description="The created date of the SourceDocument", default=datetime.now()
-    )
-
-
 class ElasticSearchDocumentHit(BaseModel):
     document_id: int = Field(description="The ID of the Document")
     score: Optional[float] = Field(
diff --git a/backend/src/app/core/data/dto/source_document_data.py b/backend/src/app/core/data/dto/source_document_data.py
index fe68918a2..d10a292ae 100644
--- a/backend/src/app/core/data/dto/source_document_data.py
+++ b/backend/src/app/core/data/dto/source_document_data.py
@@ -37,8 +37,4 @@ class SourceDocumentDataRead(SourceDocumentDataBase):
 
 # Properties for creation
 class SourceDocumentDataCreate(SourceDocumentDataBase):
-    word_frequencies: str = Field(
-        description=(
-            "JSON Representation of List[WordFrequency] of the SourceDocument"
-        ),
-    )
+    pass
diff --git a/backend/src/app/core/data/orm/source_document_data.py b/backend/src/app/core/data/orm/source_document_data.py
index f4907ce69..1ad322b94 100644
--- a/backend/src/app/core/data/orm/source_document_data.py
+++ b/backend/src/app/core/data/orm/source_document_data.py
@@ -29,13 +29,6 @@ class SourceDocumentDataORM(ORMBase):
     sentence_ends: Mapped[List[int]] = mapped_column(
         ARRAY(Integer), nullable=False, index=False
     )
-    # JSON representation of List[{word: str, count: int}]
-    word_frequencies: Mapped[str] = mapped_column(
-        String,
-        server_default="[]",
-        nullable=False,
-        index=False,
-    )
 
     @property
     def tokens(self):
diff --git a/backend/src/app/core/search/elasticsearch_service.py b/backend/src/app/core/search/elasticsearch_service.py
index 0ec0e99d1..a9d54061a 100644
--- a/backend/src/app/core/search/elasticsearch_service.py
+++ b/backend/src/app/core/search/elasticsearch_service.py
@@ -8,7 +8,6 @@
 from app.core.data.dto.search import (
     ElasticSearchDocumentCreate,
     ElasticSearchDocumentHit,
-    ElasticSearchDocumentRead,
     ElasticSearchMemoCreate,
     ElasticSearchMemoRead,
     ElasticSearchMemoUpdate,
@@ -18,16 +17,6 @@ from config import conf
 
 
-class NoSuchSourceDocumentInElasticSearchError(Exception):
-    def __init__(self, proj_id: int, sdoc_id: int):
-        super().__init__(
-            (
-                f"There exists no SourceDocument with ID={sdoc_id} in Project {proj_id}"
-                " in the respective ElasticSearch Index!"
-            )
-        )
-
-
 class NoSuchMemoInElasticSearchError(Exception):
     def __init__(self, proj_id: int, memo_id: int):
         super().__init__(
             (
@@ -203,77 +192,6 @@ def add_document_to_index(
         )
         return res["_id"]
 
-    def bulk_add_documents_to_index(
-        self, *, proj_id: int, esdocs: List[ElasticSearchDocumentCreate]
-    ) -> int:
-        idx_name = self.__get_index_name(proj_id=proj_id, index_type="doc")
-
-        def generate_actions_for_bulk_index():
-            for esdoc in esdocs:
-                doc = {
-                    "_index": idx_name,
-                    "_id": esdoc.sdoc_id,
-                    "_source": esdoc.model_dump(),
-                }
-                yield doc
-
-        num_indexed, num_errors = helpers.bulk(
-            client=self.__client,
-            actions=generate_actions_for_bulk_index(),
-            stats_only=True,
-        )
-        logger.debug(
-            f"Added {num_indexed} Documents to ElasticSearch Index {idx_name} with {num_errors} Errors!"
-        )
-        return num_indexed
-
-    def get_esdocs_by_sdoc_ids(
-        self, *, proj_id: int, sdoc_ids: Set[int], fields: Set[str]
-    ) -> List[ElasticSearchDocumentRead]:
-        if not fields.union(self.doc_index_fields):
-            raise NoSuchFieldInIndexError(
-                index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
-                fields=fields,
-                index_fields=self.doc_index_fields,
-            )
-        results = self.__client.mget(
-            index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
-            _source=list(fields),
-            body={"ids": list(sdoc_ids)},
-        )
-
-        esdocs = []
-        for res in results["docs"]:
-            if not res["found"]:
-                raise NoSuchSourceDocumentInElasticSearchError(
-                    proj_id=proj_id, sdoc_id=res["_id"]
-                )
-            esdocs.append(
-                ElasticSearchDocumentRead(sdoc_id=res["_id"], **res["_source"])
-            )
-
-        return esdocs
-
-    def get_esdoc_by_sdoc_id(
-        self, *, proj_id: int, sdoc_id: int, fields: Set[str]
-    ) -> Optional[ElasticSearchDocumentRead]:
-        if not fields.union(self.doc_index_fields):
-            raise NoSuchFieldInIndexError(
-                index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
-                fields=fields,
-                index_fields=self.doc_index_fields,
-            )
-        res = self.__client.get(
-            index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
-            id=str(sdoc_id),
-            _source=list(fields),
-        )
-        if not res["found"]:
-            raise NoSuchSourceDocumentInElasticSearchError(
-                proj_id=proj_id, sdoc_id=sdoc_id
-            )
-        return ElasticSearchDocumentRead(**res["_source"])
-
     def delete_document_from_index(self, proj_id: int, sdoc_id: int) -> None:
         self.__client.delete(
             index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
diff --git a/backend/src/app/preprocessing/pipeline/steps/text/store_document_in_elasticsearch.py b/backend/src/app/preprocessing/pipeline/steps/text/store_document_in_elasticsearch.py
index 48479cff5..ffcf4f17f 100644
--- a/backend/src/app/preprocessing/pipeline/steps/text/store_document_in_elasticsearch.py
+++ b/backend/src/app/preprocessing/pipeline/steps/text/store_document_in_elasticsearch.py
@@ -1,6 +1,5 @@
 from app.core.data.dto.search import (
     ElasticSearchDocumentCreate,
-    ElasticSearchIntegerRange,
 )
 from app.core.search.elasticsearch_service import ElasticSearchService
 from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
@@ -18,17 +17,6 @@ def store_document_in_elasticsearch(cargo: PipelineCargo) -> PipelineCargo:
     esdoc = ElasticSearchDocumentCreate(
         filename=pptd.filename,
         content=pptd.text,
-        html=pptd.html,
-        tokens=pptd.tokens,
-        token_character_offsets=[
-            ElasticSearchIntegerRange(gte=o[0], lt=o[1])
-            for o in pptd.token_character_offsets
-        ],
-        sentences=[s.text for s in pptd.sentences],
-        sentence_character_offsets=[
-            ElasticSearchIntegerRange(gte=s.start, lt=s.end) for s in pptd.sentences
-        ],
-        keywords=pptd.keywords,
         sdoc_id=sdoc_id,
         project_id=pptd.project_id,
     )
diff --git a/backend/src/app/preprocessing/pipeline/steps/text/write_pptd_to_database.py b/backend/src/app/preprocessing/pipeline/steps/text/write_pptd_to_database.py
index 297ba780b..992894120 100644
--- a/backend/src/app/preprocessing/pipeline/steps/text/write_pptd_to_database.py
+++ b/backend/src/app/preprocessing/pipeline/steps/text/write_pptd_to_database.py
@@ -1,6 +1,5 @@
 import traceback
 
-import srsly
 from loguru import logger
 from psycopg2 import OperationalError
 from sqlalchemy.orm import Session
@@ -53,10 +52,6 @@ def _create_and_persist_sdoc(db: Session, pptd: PreProTextDoc) -> SourceDocument
 def _persist_sdoc_data(
     db: Session, sdoc_db_obj: SourceDocumentORM, pptd: PreProTextDoc
 ) -> None:
-    word_frequencies_str = srsly.json_dumps(
-        [{"word": word, "count": count} for word, count in pptd.word_freqs.items()]
-    )
-
     sdoc_data = SourceDocumentDataCreate(
         id=sdoc_db_obj.id,
         content=pptd.text,
@@ -65,7 +60,6 @@ def _persist_sdoc_data(
         token_ends=[e for _, e in pptd.token_character_offsets],
         sentence_starts=[s.start for s in pptd.sentences],
         sentence_ends=[s.end for s in pptd.sentences],
-        word_frequencies=word_frequencies_str,
     )
 
     crud_sdoc_data.create(db=db, create_dto=sdoc_data)
diff --git a/backend/src/configs/default_sdoc_index_mapping.json b/backend/src/configs/default_sdoc_index_mapping.json
index a31d66249..654354774 100644
--- a/backend/src/configs/default_sdoc_index_mapping.json
+++ b/backend/src/configs/default_sdoc_index_mapping.json
@@ -6,18 +6,6 @@
     "content": {
       "type": "text"
     },
-    "html": {
-      "type": "text"
-    },
-    "tokens": {
-      "type": "keyword"
-    },
-    "token_character_offsets": {
-      "type": "integer_range"
-    },
-    "keywords": {
-      "type": "keyword"
-    },
     "sdoc_id": {
       "type": "long"
     },
diff --git a/backend/src/main.py b/backend/src/main.py
index 9d9c51672..65a3d0644 100644
--- a/backend/src/main.py
+++ b/backend/src/main.py
@@ -87,7 +87,6 @@
 )
 from app.core.search.elasticsearch_service import (
     NoSuchMemoInElasticSearchError,
-    NoSuchSourceDocumentInElasticSearchError,
 )
 from config import conf
 
@@ -198,13 +197,6 @@ async def export_job_preparation_error_handler(_, exc: ExportJobPreparationError
     return PlainTextResponse(str(exc), status_code=500)
 
 
-@app.exception_handler(NoSuchSourceDocumentInElasticSearchError)
-async def no_such_sdoc_in_es_error_handler(
-    _, exc: NoSuchSourceDocumentInElasticSearchError
-):
-    return PlainTextResponse(str(exc), status_code=500)
-
-
 @app.exception_handler(NoSuchMemoInElasticSearchError)
 async def no_such_memo_in_es_error_handler(_, exc: NoSuchMemoInElasticSearchError):
     return PlainTextResponse(str(exc), status_code=500)
diff --git a/backend/src/migration/migrate.py b/backend/src/migration/migrate.py
index 5adedadd4..107cb9742 100644
--- a/backend/src/migration/migrate.py
+++ b/backend/src/migration/migrate.py
@@ -12,7 +12,6 @@
 from app.core.data.crud.source_document_metadata import crud_sdoc_meta
 from app.core.data.doc_type import DocType
 from app.core.data.dto.project_metadata import ProjectMetadataCreate
-from app.core.data.dto.search import ElasticSearchDocumentRead
 from app.core.data.dto.source_document import SDocStatus
 from app.core.data.dto.source_document_data import SourceDocumentDataCreate
 from app.core.data.dto.source_document_metadata import SourceDocumentMetadataCreate
@@ -138,7 +137,7 @@ def __migrate_project_docs(
 
 
 def __es_doc_to_sdoc_data(
-    es_doc: ElasticSearchDocumentRead,
+    es_doc,
 ) -> SourceDocumentDataCreate:
     data = SourceDocumentDataCreate(
         id=es_doc.sdoc_id,
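
Note for reviewers: after this patch, word frequencies live only in the WordFrequencyORM table and are read through the new crud_word_frequency.read_by_project_and_doctype accessor instead of json-decoding the dropped sourcedocumentdata.word_frequencies column. Below is a minimal usage sketch, not part of the patch; the session handling, the project ID, and the assumption that WordFrequencyRead exposes word and count fields (matching the old JSON shape List[{word: str, count: int}]) are illustrative.

    from app.core.data.crud.word_frequency import crud_word_frequency
    from app.core.data.doc_type import DocType
    from app.core.db.sql_service import SQLService

    # Illustrative sketch: read per-document word counts for one project's text
    # documents via the new CRUD accessor added in crud/word_frequency.py.
    with SQLService().db_session() as db:
        wfs = crud_word_frequency.read_by_project_and_doctype(
            db, project_id=1, doctype=DocType.text
        )

    # Aggregate project-wide totals, roughly what duplicate_finder_service now
    # does with the same result list.
    totals: dict = {}
    for wf in wfs:
        totals[wf.word.lower()] = totals.get(wf.word.lower(), 0) + wf.count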