Remove unused data from dbs #415

Merged: 5 commits, Sep 25, 2024
@@ -0,0 +1,40 @@
"""remove word frequencies from sdocdata

Revision ID: 53ec37ba68dd
Revises: 45549c9c4ff2
Create Date: 2024-09-17 11:13:19.039284

"""

from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "53ec37ba68dd"
down_revision: Union[str, None] = "45549c9c4ff2"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.drop_column("sourcedocumentdata", "word_frequencies")
# ### end Alembic commands ###


def downgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.add_column(
"sourcedocumentdata",
sa.Column(
"word_frequencies",
sa.VARCHAR(),
server_default=sa.text("'[]'::character varying"),
autoincrement=False,
nullable=False,
),
)
# ### end Alembic commands ###
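
For context, a minimal sketch of applying and rolling back this revision through Alembic's command API; the "alembic.ini" path is an assumption about the project layout, not part of this PR.

# Sketch only: run this migration programmatically (config path is an assumption).
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")
command.upgrade(cfg, "53ec37ba68dd")    # drops sourcedocumentdata.word_frequencies
command.downgrade(cfg, "45549c9c4ff2")  # re-adds the column with default '[]'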
28 changes: 3 additions & 25 deletions backend/src/app/core/analysis/duplicate_finder_service.py
@@ -3,15 +3,12 @@

import networkx as nx
import numpy as np
import srsly
from loguru import logger
from scipy import sparse
from sklearn.metrics.pairwise import manhattan_distances

from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.doc_type import DocType
from app.core.data.dto.word_frequency import WordFrequencyRead
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.source_document_data import SourceDocumentDataORM
from app.core.db.sql_service import SQLService
from app.util.singleton_meta import SingletonMeta

@@ -27,31 +24,12 @@ def find_duplicate_text_sdocs(
logger.info("Finding duplicate text sdocs")
t0 = time.time()
with self.sqls.db_session() as db:
result = (
db.query(
SourceDocumentDataORM.id, SourceDocumentDataORM.word_frequencies
)
.join(
SourceDocumentORM, SourceDocumentORM.id == SourceDocumentDataORM.id
)
.filter(
SourceDocumentORM.project_id == project_id,
SourceDocumentORM.doctype == DocType.text,
)
.all()
result = crud_word_frequency.read_by_project_and_doctype(
db, project_id=project_id, doctype=DocType.text
)
t1 = time.time()
logger.info(f"query took: {t1 - t0}")

t0 = time.time()
result = [
WordFrequencyRead(sdoc_id=int(row[0]), **wf)
for row in result
for wf in srsly.json_loads(row[1])
]
t1 = time.time()
logger.info(f"convert took: {t1 - t0}")

t0 = time.time()
# unique words in project
words = set([r.word.lower() for r in result])
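The removed block parsed a JSON string column per document; the replacement reads typed WordFrequency rows through the new CRUD helper. Below is a rough, hypothetical sketch of the distance-based duplicate detection that the remaining imports (scipy.sparse, sklearn's manhattan_distances) point to; function and variable names and the threshold are assumptions, not the service's actual code.

# Hypothetical sketch: find near-duplicate documents from word-frequency rows.
# `wfs` is a List[WordFrequencyRead]; all names here are assumptions.
import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import manhattan_distances

def find_near_duplicates(wfs, threshold=0):
    sdoc_ids = sorted({wf.sdoc_id for wf in wfs})
    words = sorted({wf.word.lower() for wf in wfs})
    row = {sid: i for i, sid in enumerate(sdoc_ids)}
    col = {w: i for i, w in enumerate(words)}

    # Sparse sdoc x word count matrix.
    counts = sparse.lil_matrix((len(sdoc_ids), len(words)), dtype=np.int64)
    for wf in wfs:
        counts[row[wf.sdoc_id], col[wf.word.lower()]] += wf.count
    counts = counts.tocsr()

    # A Manhattan distance of 0 (or below the threshold) means (near-)identical counts.
    dists = manhattan_distances(counts)
    return [
        (sdoc_ids[i], sdoc_ids[j])
        for i in range(len(sdoc_ids))
        for j in range(i + 1, len(sdoc_ids))
        if dists[i, j] <= threshold
    ]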
22 changes: 20 additions & 2 deletions backend/src/app/core/data/crud/word_frequency.py
@@ -1,5 +1,11 @@
from typing import List

from sqlalchemy.orm import Session

from app.core.data.crud.crud_base import CRUDBase
from app.core.data.dto.word_frequency import WordFrequencyCreate
from app.core.data.doc_type import DocType
from app.core.data.dto.word_frequency import WordFrequencyCreate, WordFrequencyRead
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.word_frequency import WordFrequencyORM


@@ -10,7 +16,19 @@ class CrudWordFrequency(
None,
]
):
pass
def read_by_project_and_doctype(
self, db: Session, *, project_id: int, doctype: DocType
) -> List[WordFrequencyRead]:
wf_orms = (
db.query(WordFrequencyORM)
.join(WordFrequencyORM.source_document)
.filter(
SourceDocumentORM.project_id == project_id,
SourceDocumentORM.doctype == doctype,
)
.all()
)
return [WordFrequencyRead.model_validate(wf) for wf in wf_orms]


crud_word_frequency = CrudWordFrequency(WordFrequencyORM)
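
A hedged usage example of the new read_by_project_and_doctype helper; the session handling mirrors the call site in duplicate_finder_service.py, and the project id is a placeholder.

# Hypothetical caller; session setup mirrors duplicate_finder_service.py.
from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.doc_type import DocType
from app.core.db.sql_service import SQLService

with SQLService().db_session() as db:
    wfs = crud_word_frequency.read_by_project_and_doctype(
        db, project_id=1, doctype=DocType.text
    )
    print(f"{len(wfs)} word-frequency rows for project 1")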
65 changes: 0 additions & 65 deletions backend/src/app/core/data/dto/search.py
@@ -25,37 +25,9 @@ class MemoQueryBase(BaseModel):
)


class ElasticSearchIntegerRange(BaseModel):
gte: int
lt: int


class ElasticSearchDocumentCreate(BaseModel):
filename: str = Field(description="The filename of the SourceDocument")
content: str = Field(description="The raw text of the SourceDocument")
html: str = Field(description="The html of the SourceDocument")
tokens: List[str] = Field(
description="The list of the tokens in the SourceDocument"
)
token_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
description=(
"The list of character " "offsets for the tokens " "in the SourceDocument"
)
)
sentences: List[str] = Field(
description="The list of the sentences in the SourceDocument"
)
sentence_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
description=(
"The list of character "
"offsets for the "
"sentences "
"in the SourceDocument"
)
)
keywords: List[str] = Field(
description="The list of keywords of the SourceDocument"
)
sdoc_id: int = Field(
description="The ID of the SourceDocument as it is in the SQL DB"
)
@@ -67,43 +39,6 @@ class ElasticSearchDocumentCreate(BaseModel):
)


class ElasticSearchDocumentRead(BaseModel):
filename: Optional[str] = Field(description="The filename of the SourceDocument")
content: Optional[str] = Field(description="The raw text of the SourceDocument")
html: Optional[str] = Field(description="The html of the SourceDocument")
tokens: Optional[List[str]] = Field(
description="The list of the tokens in the SourceDocument"
)
token_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
description=(
"The list of character " "offsets for the tokens " "in the SourceDocument"
)
)
sentences: Optional[List[str]] = Field(
description="The list of the sentences in the SourceDocument"
)
sentence_character_offsets: Optional[List[ElasticSearchIntegerRange]] = Field(
description=(
"The list of character "
"offsets for the "
"sentences "
"in the SourceDocument"
)
)
keywords: Optional[List[str]] = Field(
description="The list of keywords of the SourceDocument"
)
sdoc_id: Optional[int] = Field(
description="The ID of the SourceDocument as it is in the SQL DB"
)
project_id: Optional[int] = Field(
description="The ID of the Project the SourceDocument belongs to"
)
created: Optional[datetime] = Field(
description="The created date of the SourceDocument", default=datetime.now()
)


class ElasticSearchDocumentHit(BaseModel):
document_id: int = Field(description="The ID of the Document")
score: Optional[float] = Field(
6 changes: 1 addition & 5 deletions backend/src/app/core/data/dto/source_document_data.py
@@ -37,8 +37,4 @@ class SourceDocumentDataRead(SourceDocumentDataBase):

# Properties for creation
class SourceDocumentDataCreate(SourceDocumentDataBase):
word_frequencies: str = Field(
description=(
"JSON Representation of List[WordFrequency] of the SourceDocument"
),
)
pass
7 changes: 0 additions & 7 deletions backend/src/app/core/data/orm/source_document_data.py
@@ -29,13 +29,6 @@ class SourceDocumentDataORM(ORMBase):
sentence_ends: Mapped[List[int]] = mapped_column(
ARRAY(Integer), nullable=False, index=False
)
# JSON representation of List[{word: str, count: int}]
word_frequencies: Mapped[str] = mapped_column(
String,
server_default="[]",
nullable=False,
index=False,
)

@property
def tokens(self):
82 changes: 0 additions & 82 deletions backend/src/app/core/search/elasticsearch_service.py
@@ -8,7 +8,6 @@
from app.core.data.dto.search import (
ElasticSearchDocumentCreate,
ElasticSearchDocumentHit,
ElasticSearchDocumentRead,
ElasticSearchMemoCreate,
ElasticSearchMemoRead,
ElasticSearchMemoUpdate,
@@ -18,16 +17,6 @@
from config import conf


class NoSuchSourceDocumentInElasticSearchError(Exception):
def __init__(self, proj_id: int, sdoc_id: int):
super().__init__(
(
f"There exists no SourceDocument with ID={sdoc_id} in Project {proj_id}"
" in the respective ElasticSearch Index!"
)
)


class NoSuchMemoInElasticSearchError(Exception):
def __init__(self, proj_id: int, memo_id: int):
super().__init__(
@@ -203,77 +192,6 @@ def add_document_to_index(
)
return res["_id"]

def bulk_add_documents_to_index(
self, *, proj_id: int, esdocs: List[ElasticSearchDocumentCreate]
) -> int:
idx_name = self.__get_index_name(proj_id=proj_id, index_type="doc")

def generate_actions_for_bulk_index():
for esdoc in esdocs:
doc = {
"_index": idx_name,
"_id": esdoc.sdoc_id,
"_source": esdoc.model_dump(),
}
yield doc

num_indexed, num_errors = helpers.bulk(
client=self.__client,
actions=generate_actions_for_bulk_index(),
stats_only=True,
)
logger.debug(
f"Added {num_indexed} Documents to ElasticSearch Index {idx_name} with {num_errors} Errors!"
)
return num_indexed

def get_esdocs_by_sdoc_ids(
self, *, proj_id: int, sdoc_ids: Set[int], fields: Set[str]
) -> List[ElasticSearchDocumentRead]:
if not fields.union(self.doc_index_fields):
raise NoSuchFieldInIndexError(
index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
fields=fields,
index_fields=self.doc_index_fields,
)
results = self.__client.mget(
index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
_source=list(fields),
body={"ids": list(sdoc_ids)},
)

esdocs = []
for res in results["docs"]:
if not res["found"]:
raise NoSuchSourceDocumentInElasticSearchError(
proj_id=proj_id, sdoc_id=res["_id"]
)
esdocs.append(
ElasticSearchDocumentRead(sdoc_id=res["_id"], **res["_source"])
)

return esdocs

def get_esdoc_by_sdoc_id(
self, *, proj_id: int, sdoc_id: int, fields: Set[str]
) -> Optional[ElasticSearchDocumentRead]:
if not fields.union(self.doc_index_fields):
raise NoSuchFieldInIndexError(
index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
fields=fields,
index_fields=self.doc_index_fields,
)
res = self.__client.get(
index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
id=str(sdoc_id),
_source=list(fields),
)
if not res["found"]:
raise NoSuchSourceDocumentInElasticSearchError(
proj_id=proj_id, sdoc_id=sdoc_id
)
return ElasticSearchDocumentRead(**res["_source"])

def delete_document_from_index(self, proj_id: int, sdoc_id: int) -> None:
self.__client.delete(
index=self.__get_index_name(proj_id=proj_id, index_type="doc"),
@@ -1,6 +1,5 @@
from app.core.data.dto.search import (
ElasticSearchDocumentCreate,
ElasticSearchIntegerRange,
)
from app.core.search.elasticsearch_service import ElasticSearchService
from app.preprocessing.pipeline.model.pipeline_cargo import PipelineCargo
@@ -18,17 +17,6 @@ def store_document_in_elasticsearch(cargo: PipelineCargo) -> PipelineCargo:
esdoc = ElasticSearchDocumentCreate(
filename=pptd.filename,
content=pptd.text,
html=pptd.html,
tokens=pptd.tokens,
token_character_offsets=[
ElasticSearchIntegerRange(gte=o[0], lt=o[1])
for o in pptd.token_character_offsets
],
sentences=[s.text for s in pptd.sentences],
sentence_character_offsets=[
ElasticSearchIntegerRange(gte=s.start, lt=s.end) for s in pptd.sentences
],
keywords=pptd.keywords,
sdoc_id=sdoc_id,
project_id=pptd.project_id,
)
@@ -1,6 +1,5 @@
import traceback

import srsly
from loguru import logger
from psycopg2 import OperationalError
from sqlalchemy.orm import Session
@@ -53,10 +52,6 @@ def _create_and_persist_sdoc(db: Session, pptd: PreProTextDoc) -> SourceDocument
def _persist_sdoc_data(
db: Session, sdoc_db_obj: SourceDocumentORM, pptd: PreProTextDoc
) -> None:
word_frequencies_str = srsly.json_dumps(
[{"word": word, "count": count} for word, count in pptd.word_freqs.items()]
)

sdoc_data = SourceDocumentDataCreate(
id=sdoc_db_obj.id,
content=pptd.text,
@@ -65,7 +60,6 @@ def _persist_sdoc_data(
token_ends=[e for _, e in pptd.token_character_offsets],
sentence_starts=[s.start for s in pptd.sentences],
sentence_ends=[s.end for s in pptd.sentences],
word_frequencies=word_frequencies_str,
)
crud_sdoc_data.create(db=db, create_dto=sdoc_data)
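
With the JSON column gone from SourceDocumentDataCreate, the per-word counts in pptd.word_freqs would presumably be written as WordFrequency rows instead; the corresponding write path is not part of this diff, so the following is a hypothetical sketch only, and the WordFrequencyCreate field names and create call are assumptions.

# Hypothetical sketch, not code from this PR: persist pptd.word_freqs as rows.
from app.core.data.crud.word_frequency import crud_word_frequency
from app.core.data.dto.word_frequency import WordFrequencyCreate

def _persist_word_frequencies(db, sdoc_id, word_freqs):
    # word_freqs: Dict[str, int] as produced during preprocessing
    for word, count in word_freqs.items():
        crud_word_frequency.create(
            db=db,
            create_dto=WordFrequencyCreate(sdoc_id=sdoc_id, word=word, count=count),
        )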
