Commit

improved duplicate finder performance
bigabig committed Feb 11, 2024
1 parent 555f895 commit f2e1e8c
Showing 1 changed file with 18 additions and 3 deletions.
backend/src/app/core/analysis/duplicate_finder_service.py: 21 changes (18 additions & 3 deletions)
@@ -3,13 +3,15 @@
 import networkx as nx
 import numpy as np
+import srsly
 from loguru import logger
 from scipy import sparse
 from sklearn.metrics.pairwise import manhattan_distances
 
 from app.core.data.doc_type import DocType
 from app.core.data.dto.word_frequency import WordFrequencyRead
 from app.core.data.orm.source_document import SourceDocumentORM
 from app.core.data.orm.word_frequency import WordFrequencyORM
+from app.core.data.orm.source_document_data import SourceDocumentDataORM
 from app.core.db.sql_service import SQLService
 from app.util.singleton_meta import SingletonMeta
 
@@ -26,8 +28,12 @@ def find_duplicate_text_sdocs(
         t0 = time.time()
         with self.sqls.db_session() as db:
             result = (
-                db.query(WordFrequencyORM)
-                .join(WordFrequencyORM.source_document)
+                db.query(
+                    SourceDocumentDataORM.id, SourceDocumentDataORM.word_frequencies
+                )
+                .join(
+                    SourceDocumentORM, SourceDocumentORM.id == SourceDocumentDataORM.id
+                )
                 .filter(
                     SourceDocumentORM.project_id == project_id,
                     SourceDocumentORM.doctype == DocType.text,
@@ -37,6 +43,15 @@
         t1 = time.time()
         logger.info(f"query took: {t1 - t0}")
 
+        t0 = time.time()
+        result = [
+            WordFrequencyRead(sdoc_id=int(row[0]), **wf)
+            for row in result
+            for wf in srsly.json_loads(row[1])
+        ]
+        t1 = time.time()
+        logger.info(f"convert took: {t1 - t0}")
+
         t0 = time.time()
         # unique words in project
         words = set([r.word.lower() for r in result])
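The gist of the change: instead of loading one WordFrequencyORM row per word-document pair through a join, the query now fetches a single (id, word_frequencies) tuple per document from SourceDocumentDataORM and decodes the JSON-encoded frequency list in Python via srsly. Below is a minimal, self-contained sketch of that decoding step. It assumes the word_frequencies column holds a JSON array of {"word": ..., "count": ...} objects and uses a stand-in Pydantic model for WordFrequencyRead; the real DTO's fields are not shown in this diff.

import srsly
from pydantic import BaseModel


class WordFrequencyRead(BaseModel):
    # Stand-in for app.core.data.dto.word_frequency.WordFrequencyRead;
    # the field names here are assumptions for illustration.
    sdoc_id: int
    word: str
    count: int


# Rows shaped like the rewritten query's output: (sdoc_id, JSON-encoded frequencies).
rows = [
    (1, '[{"word": "duplicate", "count": 3}, {"word": "finder", "count": 1}]'),
    (2, '[{"word": "duplicate", "count": 2}]'),
]

# One srsly.json_loads call per document replaces one ORM row per word.
result = [
    WordFrequencyRead(sdoc_id=int(row[0]), **wf)
    for row in rows
    for wf in srsly.json_loads(row[1])
]

for wf in result:
    print(wf.sdoc_id, wf.word, wf.count)

The likely source of the speedup is row count: for N documents with V distinct words each, the old query shipped roughly N x V ORM objects through SQLAlchemy, while the new one ships N plain rows and leaves the fan-out to a cheap list comprehension.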
