
Merge pull request #415 from uhh-lt/remove-unused-data-from-dbs
Remove unused data from dbs
bigabig authored Sep 25, 2024
2 parents b5d37f8 + d14ed8a commit 5d3b2ed
Showing 12 changed files with 65 additions and 226 deletions.
40 changes: 40 additions & 0 deletions — new Alembic migration (file path not captured in this view)
@@ -0,0 +1,40 @@
"""remove word frequencies from sdocdata
Revision ID: 53ec37ba68dd
Revises: 45549c9c4ff2
Create Date: 2024-09-17 11:13:19.039284
"""

from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "53ec37ba68dd"
down_revision: Union[str, None] = "45549c9c4ff2"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.drop_column("sourcedocumentdata", "word_frequencies")
    # ### end Alembic commands ###


def downgrade() -> None:
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "sourcedocumentdata",
        sa.Column(
            "word_frequencies",
            sa.VARCHAR(),
            server_default=sa.text("'[]'::character varying"),
            autoincrement=False,
            nullable=False,
        ),
    )
    # ### end Alembic commands ###
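Reviewer note: a minimal sketch of applying and reverting this revision programmatically, assuming a standard Alembic setup with an alembic.ini next to the migration environment (the config path is an assumption; the usual CLI equivalent is `alembic upgrade head` / `alembic downgrade 45549c9c4ff2`):

```python
# Minimal sketch, not part of this PR; the "alembic.ini" path is assumed.
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")

# Apply this revision: drops sourcedocumentdata.word_frequencies.
command.upgrade(cfg, "53ec37ba68dd")

# Revert to the previous revision: restores the VARCHAR column with default '[]'.
command.downgrade(cfg, "45549c9c4ff2")
```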
28 changes: 3 additions & 25 deletions backend/src/app/core/analysis/duplicate_finder_service.py
@@ -3,15 +3,12 @@

import networkx as nx
import numpy as np
import srsly
from loguru import logger
from scipy import sparse
from sklearn.metrics.pairwise import manhattan_distances

from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.doc_type import DocType
from app.core.data.dto.word_frequency import WordFrequencyRead
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.source_document_data import SourceDocumentDataORM
from app.core.db.sql_service import SQLService
from app.util.singleton_meta import SingletonMeta

@@ -27,31 +24,12 @@ def find_duplicate_text_sdocs(
logger.info("Finding duplicate text sdocs")
t0 = time.time()
with self.sqls.db_session() as db:
result = (
db.query(
SourceDocumentDataORM.id, SourceDocumentDataORM.word_frequencies
)
.join(
SourceDocumentORM, SourceDocumentORM.id == SourceDocumentDataORM.id
)
.filter(
SourceDocumentORM.project_id == project_id,
SourceDocumentORM.doctype == DocType.text,
)
.all()
result = crud_word_frequency.read_by_project_and_doctype(
db, project_id=project_id, doctype=DocType.text
)
t1 = time.time()
logger.info(f"query took: {t1 - t0}")

t0 = time.time()
result = [
WordFrequencyRead(sdoc_id=int(row[0]), **wf)
for row in result
for wf in srsly.json_loads(row[1])
]
t1 = time.time()
logger.info(f"convert took: {t1 - t0}")

t0 = time.time()
# unique words in project
words = set([r.word.lower() for r in result])
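Reviewer note: the remaining imports (scipy.sparse, manhattan_distances, networkx) suggest the word frequencies are turned into a sparse document-term matrix and compared pairwise. The continuation of find_duplicate_text_sdocs is not part of this diff; the sketch below only illustrates that idea, with assumed variable names and an assumed distance threshold.

```python
# Illustrative sketch only — not the code of this PR. "result" is the
# List[WordFrequencyRead] returned by read_by_project_and_doctype above.
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import manhattan_distances

words = sorted({r.word.lower() for r in result})
word_idx = {w: i for i, w in enumerate(words)}
sdoc_ids = sorted({r.sdoc_id for r in result})
sdoc_idx = {s: i for i, s in enumerate(sdoc_ids)}

# Sparse document-term count matrix: one row per SourceDocument, one column per word.
rows = [sdoc_idx[r.sdoc_id] for r in result]
cols = [word_idx[r.word.lower()] for r in result]
counts = [r.count for r in result]
dtm = sparse.csr_matrix((counts, (rows, cols)), shape=(len(sdoc_ids), len(words)))

# Pairwise L1 distances; a distance of 0 means identical word counts.
dists = manhattan_distances(dtm)
candidate_pairs = np.argwhere(np.triu(dists <= 0, k=1))  # threshold 0 is an assumption
```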
22 changes: 20 additions & 2 deletions backend/src/app/core/data/crud/word_frequency.py
@@ -1,5 +1,11 @@
from typing import List

from sqlalchemy.orm import Session

from app.core.data.crud.crud_base import CRUDBase
from app.core.data.dto.word_frequency import WordFrequencyCreate
from app.core.data.doc_type import DocType
from app.core.data.dto.word_frequency import WordFrequencyCreate, WordFrequencyRead
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.word_frequency import WordFrequencyORM


@@ -10,7 +16,19 @@ class CrudWordFrequency(
        None,
    ]
):
    pass
    def read_by_project_and_doctype(
        self, db: Session, *, project_id: int, doctype: DocType
    ) -> List[WordFrequencyRead]:
        wf_orms = (
            db.query(WordFrequencyORM)
            .join(WordFrequencyORM.source_document)
            .filter(
                SourceDocumentORM.project_id == project_id,
                SourceDocumentORM.doctype == doctype,
            )
            .all()
        )
        return [WordFrequencyRead.model_validate(wf) for wf in wf_orms]


crud_word_frequency = CrudWordFrequency(WordFrequencyORM)
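Reviewer note: a brief usage sketch of the new accessor. The session handling via SQLService mirrors duplicate_finder_service.py; the concrete project_id is only an example.

```python
# Usage sketch under the above assumptions — not part of this PR.
from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.doc_type import DocType
from app.core.db.sql_service import SQLService

with SQLService().db_session() as db:
    wfs = crud_word_frequency.read_by_project_and_doctype(
        db, project_id=1, doctype=DocType.text
    )
# Callers now receive typed WordFrequencyRead objects instead of parsing the
# former JSON column themselves.
vocabulary = {wf.word.lower() for wf in wfs}
```

Note that WordFrequencyRead.model_validate(wf_orm) presumes the DTO is configured for attribute-based validation (from_attributes=True in Pydantic v2).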
65 changes: 0 additions & 65 deletions backend/src/app/core/data/dto/search.py
@@ -25,37 +25,9 @@ class MemoQueryBase(BaseModel):
    )


class ElasticSearchIntegerRange(BaseModel):
    gte: int
    lt: int


class ElasticSearchDocumentCreate(BaseModel):
    filename: str = Field(description="The filename of the SourceDocument")
    content: str = Field(description="The raw text of the SourceDocument")
    html: str = Field(description="The html of the SourceDocument")
    tokens: List[str] = Field(
        description="The list of the tokens in the SourceDocument"
    )
    token_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
        description=(
            "The list of character " "offsets for the tokens " "in the SourceDocument"
        )
    )
    sentences: List[str] = Field(
        description="The list of the sentences in the SourceDocument"
    )
    sentence_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
        description=(
            "The list of character "
            "offsets for the "
            "sentences "
            "in the SourceDocument"
        )
    )
    keywords: List[str] = Field(
        description="The list of keywords of the SourceDocument"
    )
    sdoc_id: int = Field(
        description="The ID of the SourceDocument as it is in the SQL DB"
    )
@@ -67,43 +39,6 @@ class ElasticSearchDocumentCreate(BaseModel):
    )


class ElasticSearchDocumentRead(BaseModel):
    filename: Optional[str] = Field(description="The filename of the SourceDocument")
    content: Optional[str] = Field(description="The raw text of the SourceDocument")
    html: Optional[str] = Field(description="The html of the SourceDocument")
    tokens: Optional[List[str]] = Field(
        description="The list of the tokens in the SourceDocument"
    )
    token_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
        description=(
            "The list of character " "offsets for the tokens " "in the SourceDocument"
        )
    )
    sentences: Optional[List[str]] = Field(
        description="The list of the sentences in the SourceDocument"
    )
    sentence_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
        description=(
            "The list of character "
            "offsets for the "
            "sentences "
            "in the SourceDocument"
        )
    )
    keywords: Optional[List[str]] = Field(
        description="The list of keywords of the SourceDocument"
    )
    sdoc_id: Optional[int] = Field(
        description="The ID of the SourceDocument as it is in the SQL DB"
    )
    project_id: Optional[int] = Field(
        description="The ID of the Project the SourceDocument belongs to"
    )
    created: Optional[datetime] = Field(
        description="The created date of the SourceDocument", default=datetime.now()
    )


class ElasticSearchDocumentHit(BaseModel):
    document_id: int = Field(description="The ID of the Document")
    score: Optional[float] = Field(
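Reviewer note: after these deletions, ElasticSearchDocumentCreate is reduced to roughly the fields still passed by the preprocessing step further down in this diff (filename, content, sdoc_id, project_id, plus the created timestamp). A hedged sketch of the resulting shape; the exact remaining definition lies outside the shown hunks.

```python
# Approximate post-PR shape, inferred from the remaining diff lines; the
# "created" default mirrors the Read variant and is an assumption.
from datetime import datetime
from pydantic import BaseModel, Field


class ElasticSearchDocumentCreate(BaseModel):
    filename: str = Field(description="The filename of the SourceDocument")
    content: str = Field(description="The raw text of the SourceDocument")
    sdoc_id: int = Field(
        description="The ID of the SourceDocument as it is in the SQL DB"
    )
    project_id: int = Field(
        description="The ID of the Project the SourceDocument belongs to"
    )
    created: datetime = Field(
        description="The created date of the SourceDocument",
        default_factory=datetime.now,
    )
```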
6 changes: 1 addition & 5 deletions backend/src/app/core/data/dto/source_document_data.py
@@ -37,8 +37,4 @@ class SourceDocumentDataRead(SourceDocumentDataBase):

# Properties for creation
class SourceDocumentDataCreate(SourceDocumentDataBase):
    word_frequencies: str = Field(
        description=(
            "JSON Representation of List[WordFrequency] of the SourceDocument"
        ),
    )
    pass
7 changes: 0 additions & 7 deletions backend/src/app/core/data/orm/source_document_data.py
@@ -29,13 +29,6 @@ class SourceDocumentDataORM(ORMBase):
    sentence_ends: Mapped[List[int]] = mapped_column(
        ARRAY(Integer), nullable=False, index=False
    )
    # JSON representation of List[{word: str, count: int}]
    word_frequencies: Mapped[str] = mapped_column(
        String,
        server_default="[]",
        nullable=False,
        index=False,
    )

    @property
    def tokens(self):
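Reviewer note: with the JSON column gone, per-document word counts live only in the dedicated word_frequency table. The ORM sketched below is an assumption inferred from the join in crud_word_frequency and the fields of WordFrequencyRead; the real WordFrequencyORM is not shown in this PR.

```python
# Hypothetical shape of WordFrequencyORM — NOT from this PR; column names are
# inferred from WordFrequencyRead (sdoc_id, word, count), while the base class,
# module path, and FK target are assumptions.
from sqlalchemy import ForeignKey, Integer, String
from sqlalchemy.orm import Mapped, mapped_column, relationship

from app.core.data.orm.orm_base import ORMBase  # assumed module path


class WordFrequencyORM(ORMBase):
    sdoc_id: Mapped[int] = mapped_column(
        Integer, ForeignKey("sourcedocument.id", ondelete="CASCADE"), primary_key=True
    )
    word: Mapped[str] = mapped_column(String, primary_key=True)
    count: Mapped[int] = mapped_column(Integer, nullable=False)

    # Enables the join used by crud_word_frequency.read_by_project_and_doctype.
    source_document: Mapped["SourceDocumentORM"] = relationship("SourceDocumentORM")
```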
82 changes: 0 additions & 82 deletions backend/src/app/core/search/elasticsearch_service.py
@@ -8,7 +8,6 @@
from app.core.data.dto.search import (
    ElasticSearchDocumentCreate,
    ElasticSearchDocumentHit,
    ElasticSearchDocumentRead,
    ElasticSearchMemoCreate,
    ElasticSearchMemoRead,
    ElasticSearchMemoUpdate,
@@ -18,16 +17,6 @@
from config import conf


class NoSuchSourceDocumentInElasticSearchError(Exception):
    def __init__(self, proj_id: int, sdoc_id: int):
        super().__init__(
            (
                f"There exists no SourceDocument with ID={sdoc_id} in Project {proj_id}"
                " in the respective ElasticSearch Index!"
            )
        )


class NoSuchMemoInElasticSearchError(Exception):
    def __init__(self, proj_id: int, memo_id: int):
        super().__init__(
@@ -203,77 +192,6 @@ def add_document_to_index(
        )
        return res["_id"]

    def bulk_add_documents_to_index(
        self, *, proj_id: int, esdocs: List[ElasticSearchDocumentCreate]
    ) -> int:
        idx_name = self.__get_index_name(proj_id=proj_id, index_type="doc")

        def generate_actions_for_bulk_index():
            for esdoc in esdocs:
                doc = {
                    "_index": idx_name,
                    "_id": esdoc.sdoc_id,
                    "_source": esdoc.model_dump(),
                }
                yield doc

        num_indexed, num_errors = helpers.bulk(
            client=self.__client,
            actions=generate_actions_for_bulk_index(),
            stats_only=True,
        )
        logger.debug(
            f"Added {num_indexed} Documents to ElasticSearch Index {idx_name} with {num_errors} Errors!"
        )
        return num_indexed

    def get_esdocs_by_sdoc_ids(
        self, *, proj_id: int, sdoc_ids: Set[int], fields: Set[str]
    ) -> List[ElasticSearchDocumentRead]:
        if not fields.union(self.doc_index_fields):
            raise NoSuchFieldInIndexError(
                index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
                fields=fields,
                index_fields=self.doc_index_fields,
            )
        results = self.__client.mget(
            index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
            _source=list(fields),
            body={"ids": list(sdoc_ids)},
        )

        esdocs = []
        for res in results["docs"]:
            if not res["found"]:
                raise NoSuchSourceDocumentInElasticSearchError(
                    proj_id=proj_id, sdoc_id=res["_id"]
                )
            esdocs.append(
                ElasticSearchDocumentRead(sdoc_id=res["_id"], **res["_source"])
            )

        return esdocs

    def get_esdoc_by_sdoc_id(
        self, *, proj_id: int, sdoc_id: int, fields: Set[str]
    ) -> Optional[ElasticSearchDocumentRead]:
        if not fields.union(self.doc_index_fields):
            raise NoSuchFieldInIndexError(
                index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
                fields=fields,
                index_fields=self.doc_index_fields,
            )
        res = self.__client.get(
            index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
            id=str(sdoc_id),
            _source=list(fields),
        )
        if not res["found"]:
            raise NoSuchSourceDocumentInElasticSearchError(
                proj_id=proj_id, sdoc_id=sdoc_id
            )
        return ElasticSearchDocumentRead(**res["_source"])

    def delete_document_from_index(self, proj_id: int, sdoc_id: int) -> None:
        self.__client.delete(
            index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
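Reviewer note: with get_esdoc_by_sdoc_id / get_esdocs_by_sdoc_ids removed, callers that need raw document content would read it from the SQL side (SourceDocumentDataORM) rather than Elasticsearch. A hedged sketch; the crud_sdoc_data module path and its read signature are assumptions based on the call in the preprocessing step below.

```python
# Hedged sketch, not part of this PR; assumes CRUDBase exposes read(db, id=...).
from app.core.data.crud.source_document_data import crud_sdoc_data  # assumed path
from app.core.db.sql_service import SQLService

sdoc_id = 42  # example ID

with SQLService().db_session() as db:
    sdoc_data = crud_sdoc_data.read(db=db, id=sdoc_id)
    text = sdoc_data.content  # previously fetched via ElasticSearch mget/get
```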
(File header not captured: the preprocessing pipeline step that stores SourceDocuments in Elasticsearch.)
@@ -1,6 +1,5 @@
from app.core.data.dto.search import (
    ElasticSearchDocumentCreate,
    ElasticSearchIntegerRange,
)
from app.core.search.elasticsearch_service import ElasticSearchService
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
@@ -18,17 +17,6 @@ def store_document_in_elasticsearch(cargo: PipelineCargo) -> PipelineCargo:
    esdoc = ElasticSearchDocumentCreate(
        filename=pptd.filename,
        content=pptd.text,
        html=pptd.html,
        tokens=pptd.tokens,
        token_character_offsets=[
            ElasticSearchIntegerRange(gte=o[0], lt=o[1])
            for o in pptd.token_character_offsets
        ],
        sentences=[s.text for s in pptd.sentences],
        sentence_character_offsets=[
            ElasticSearchIntegerRange(gte=s.start, lt=s.end) for s in pptd.sentences
        ],
        keywords=pptd.keywords,
        sdoc_id=sdoc_id,
        project_id=pptd.project_id,
    )
(File header not captured: the preprocessing step that persists SourceDocument data to the SQL database.)
@@ -1,6 +1,5 @@
import traceback

import srsly
from loguru import logger
from psycopg2 import OperationalError
from sqlalchemy.orm import Session
@@ -53,10 +52,6 @@ def _create_and_persist_sdoc(db: Session, pptd: PreProTextDoc) -> SourceDocument
def _persist_sdoc_data(
    db: Session, sdoc_db_obj: SourceDocumentORM, pptd: PreProTextDoc
) -> None:
    word_frequencies_str = srsly.json_dumps(
        [{"word": word, "count": count} for word, count in pptd.word_freqs.items()]
    )

    sdoc_data = SourceDocumentDataCreate(
        id=sdoc_db_obj.id,
        content=pptd.text,
@@ -65,7 +60,6 @@ def _persist_sdoc_data(
        token_ends=[e for _, e in pptd.token_character_offsets],
        sentence_starts=[s.start for s in pptd.sentences],
        sentence_ends=[s.end for s in pptd.sentences],
        word_frequencies=word_frequencies_str,
    )
    crud_sdoc_data.create(db=db, create_dto=sdoc_data)

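Reviewer note: since word_frequencies is no longer part of SourceDocumentDataCreate, the counts in pptd.word_freqs presumably get written through the WordFrequency CRUD in a separate pipeline step that is not shown here. A hedged sketch of what that write could look like; the bulk create helper and the WordFrequencyCreate field names are assumptions.

```python
# Hedged sketch — not code from this PR.
from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.dto.word_frequency import WordFrequencyCreate

wf_create_dtos = [
    WordFrequencyCreate(sdoc_id=sdoc_db_obj.id, word=word, count=count)
    for word, count in pptd.word_freqs.items()
]
# create_multi is an assumed bulk helper on the CRUD base class.
crud_word_frequency.create_multi(db=db, create_dtos=wf_create_dtos)
```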
(Remaining changed files were not loaded in this view.)
