Skip to content

Commit

Permalink
Merge pull request #351 from uhh-lt/fix-duplicate-finder
Browse files Browse the repository at this point in the history
Fix duplicate finder
  • Loading branch information
bigabig authored Feb 11, 2024
2 parents 7e7044a + f2e1e8c commit e6b8d04
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 6 deletions.
4 changes: 2 additions & 2 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@
"name": "Alembic: check",
"type": "node-terminal",
"request": "launch",
"command": "alembic check",
"command": "micromamba activate dwts && alembic check",
"cwd": "${workspaceFolder}/backend/src",
"envFile": "${workspaceFolder}/backend/.env"
},
{
"name": "Alembic: revision",
"type": "node-terminal",
"request": "launch",
"command": "alembic revision --autogenerate -m \"vscode launcher\"",
"command": "micromamba activate dwts && alembic revision --autogenerate -m \"vscode launcher\"",
"cwd": "${workspaceFolder}/backend/src",
"envFile": "${workspaceFolder}/backend/.env"
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""add word_frequencies to SourceDocumentData
Revision ID: b9de10411f61
Revises: 3bd76cc03486
Create Date: 2024-02-10 17:50:19.307561
"""
from typing import Sequence, Union

import sqlalchemy as sa

from alembic import op

# revision identifiers, used by Alembic.
revision: str = "b9de10411f61"
down_revision: Union[str, None] = "3bd76cc03486"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Schema upgrade: add the ``word_frequencies`` column to the
    ``sourcedocumentdata`` table.

    The column holds a JSON string (serialized list of word/count pairs)
    and defaults to an empty JSON list so existing rows stay valid.
    """
    word_frequencies_column = sa.Column(
        "word_frequencies",
        sa.String(),
        server_default="[]",
        nullable=False,
    )
    op.add_column("sourcedocumentdata", word_frequencies_column)


def downgrade() -> None:
    """Schema downgrade: drop the column added by :func:`upgrade`."""
    op.drop_column(
        table_name="sourcedocumentdata",
        column_name="word_frequencies",
    )
21 changes: 18 additions & 3 deletions backend/src/app/core/analysis/duplicate_finder_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@

import networkx as nx
import numpy as np
import srsly
from loguru import logger
from scipy import sparse
from sklearn.metrics.pairwise import manhattan_distances

from app.core.data.doc_type import DocType
from app.core.data.dto.word_frequency import WordFrequencyRead
from app.core.data.orm.source_document import SourceDocumentORM
from app.core.data.orm.word_frequency import WordFrequencyORM
from app.core.data.orm.source_document_data import SourceDocumentDataORM
from app.core.db.sql_service import SQLService
from app.util.singleton_meta import SingletonMeta

Expand All @@ -26,8 +28,12 @@ def find_duplicate_text_sdocs(
t0 = time.time()
with self.sqls.db_session() as db:
result = (
db.query(WordFrequencyORM)
.join(WordFrequencyORM.source_document)
db.query(
SourceDocumentDataORM.id, SourceDocumentDataORM.word_frequencies
)
.join(
SourceDocumentORM, SourceDocumentORM.id == SourceDocumentDataORM.id
)
.filter(
SourceDocumentORM.project_id == project_id,
SourceDocumentORM.doctype == DocType.text,
Expand All @@ -37,6 +43,15 @@ def find_duplicate_text_sdocs(
t1 = time.time()
logger.info(f"query took: {t1 - t0}")

t0 = time.time()
result = [
WordFrequencyRead(sdoc_id=int(row[0]), **wf)
for row in result
for wf in srsly.json_loads(row[1])
]
t1 = time.time()
logger.info(f"convert took: {t1 - t0}")

t0 = time.time()
# unique words in project
words = set([r.word.lower() for r in result])
Expand Down
6 changes: 5 additions & 1 deletion backend/src/app/core/data/dto/source_document_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,8 @@ class SourceDocumentDataRead(SourceDocumentDataBase):

# Properties for creation
# Properties for creation
class SourceDocumentDataCreate(SourceDocumentDataBase):
    """DTO used to create a SourceDocumentData row.

    Extends the base DTO with the serialized word-frequency payload that is
    persisted alongside the document data.
    """

    # NOTE(review): stored as a raw JSON string (serialized
    # List[WordFrequency]), not a structured field — consumers must
    # json-decode it themselves.
    word_frequencies: str = Field(
        description=(
            "JSON Representation of List[WordFrequency] of the SourceDocument"
        ),
    )
7 changes: 7 additions & 0 deletions backend/src/app/core/data/orm/source_document_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ class SourceDocumentDataORM(ORMBase):
sentence_ends: Mapped[List[int]] = mapped_column(
ARRAY(Integer), nullable=False, index=False
)
# JSON representation of List[{word: str, count: int}]
word_frequencies: Mapped[str] = mapped_column(
String,
server_default="[]",
nullable=False,
index=False,
)

@property
def tokens(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import traceback

import srsly
from loguru import logger
from psycopg2 import OperationalError
from sqlalchemy.orm import Session
Expand Down Expand Up @@ -52,6 +53,10 @@ def _create_and_persist_sdoc(db: Session, pptd: PreProTextDoc) -> SourceDocument
def _persist_sdoc_data(
db: Session, sdoc_db_obj: SourceDocumentORM, pptd: PreProTextDoc
) -> None:
word_frequencies_str = srsly.json_dumps(
[{"word": word, "count": count} for word, count in pptd.word_freqs.items()]
)

sdoc_data = SourceDocumentDataCreate(
id=sdoc_db_obj.id,
content=pptd.text,
Expand All @@ -60,6 +65,7 @@ def _persist_sdoc_data(
token_ends=[e for _, e in pptd.token_character_offsets],
sentence_starts=[s.start for s in pptd.sentences],
sentence_ends=[s.end for s in pptd.sentences],
word_frequencies=word_frequencies_str,
)
crud_sdoc_data.create(db=db, create_dto=sdoc_data)

Expand Down
53 changes: 53 additions & 0 deletions backend/src/migration/migrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from alembic.command import upgrade
from alembic.config import Config
from app.core.data.crud.crud_base import NoSuchElementError
from app.core.data.crud.project_metadata import crud_project_meta
from app.core.data.crud.source_document_data import crud_sdoc_data
from app.core.data.crud.source_document_metadata import crud_sdoc_meta
Expand Down Expand Up @@ -75,6 +76,11 @@ def run_required_migrations():
db_version.version = 8
db.commit()
print("MIGRATED IMAGE WIDTH HEIGHT!")
if db_version.version < 9:
__migrate_word_frequencies(db)
db_version.version = 9
db.commit()
print("MIGRATED WORD FREQUENCIES!")


def __migrate_database_schema() -> None:
Expand Down Expand Up @@ -439,3 +445,50 @@ def __migrate_image_width_height(db: Session):
db.add(height_pm)

db.commit()


def __migrate_word_frequencies(db: Session):
    """One-off data migration (database version 8 -> 9): serialize each text
    document's rows from the WordFrequencyORM table into the new
    ``sourcedocumentdata.word_frequencies`` JSON-string column.

    Documents that have no SourceDocumentData row are skipped.
    """
    # Local imports: keep migration-only dependencies scoped to this function.
    import srsly

    from app.core.data.dto.word_frequency import WordFrequencyRead
    from app.core.data.orm.word_frequency import WordFrequencyORM

    projects = db.query(ProjectORM).all()
    for project in projects:
        logger.info(
            "Migration: Migrating word_frequencies project {}...",
            project.id,
        )

        # Only text documents are migrated.
        sdoc_ids = (
            db.query(SourceDocumentORM.id)
            .filter(
                SourceDocumentORM.project_id == project.id,
                SourceDocumentORM.doctype == DocType.text,
            )
            .all()
        )
        # A single-column query returns 1-tuples; unwrap to plain ids.
        sdoc_ids = [sdoc_id[0] for sdoc_id in sdoc_ids]

        # NOTE(review): one query per document (N+1 pattern) — acceptable
        # for a one-time migration, but not a pattern for hot paths.
        for sdoc_id in sdoc_ids:
            result = (
                db.query(WordFrequencyORM)
                .filter(
                    WordFrequencyORM.sdoc_id == sdoc_id,
                )
                .all()
            )
            word_frequencies = [WordFrequencyRead.model_validate(row) for row in result]
            # Same JSON shape that new documents get at ingest time:
            # a list of {"word": ..., "count": ...} objects.
            word_frequencies_str = srsly.json_dumps(
                [{"word": wf.word, "count": wf.count} for wf in word_frequencies]
            )

            # update SourceDocumentData; documents without a data row are skipped
            try:
                db_obj = crud_sdoc_data.read(db=db, id=sdoc_id)
            except NoSuchElementError:
                continue
            setattr(db_obj, "word_frequencies", word_frequencies_str)
            db.add(db_obj)

        db.commit()

0 comments on commit e6b8d04

Please sign in to comment.