Skip to content

Commit

Permalink
Start embedding service faster by including installation step in build
Browse files Browse the repository at this point in the history
  • Loading branch information
pal03377 authored Aug 2, 2023
1 parent 7b1b9b4 commit 2dbf9c8
Show file tree
Hide file tree
Showing 7 changed files with 21 additions and 6 deletions.
4 changes: 2 additions & 2 deletions clustering/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ LABEL author="Jan Philip Bernius <[email protected]>"

COPY ./clustering/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -qr /tmp/requirements.txt
# additional flag needed in hdbscan 0.8.29, see https://github.com/scikit-learn-contrib/hdbscan/issues/457
RUN pip install --no-cache-dir hdbscan==0.8.29 --no-binary :all: --use-feature=no-binary-enable-wheel-cache
# additional flag needed since hdbscan 0.8.29, see https://github.com/scikit-learn-contrib/hdbscan/issues/457
RUN pip install --no-cache-dir hdbscan==0.8.33 --no-binary :all: --use-feature=no-binary-enable-wheel-cache

WORKDIR /usr/src/app
COPY ./clustering/src/ src/
Expand Down
2 changes: 2 additions & 0 deletions clustering/src/TimerHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from src.ProcessingResource import ProcessingResource
import os
import threading
import traceback

process_lock = threading.Lock() # Lock to prevent multiple calculations and restart during calculation
# Interval to query task queue (in seconds)
Expand Down Expand Up @@ -47,6 +48,7 @@ def run(self):
# Query task queue after timeout again
is_killed = self._kill.wait(self._interval)
except Exception as e:
traceback.print_exc()
self.__logger.error("Exception while processing: {}".format(str(e)))
process_lock.release()
# Query task queue after timeout again
Expand Down
4 changes: 4 additions & 0 deletions embedding/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ RUN mkdir -p /usr/lib/nltk_data \
# we need to upgrade numpy again for it to be detected by pytorch later
RUN pip install --no-cache-dir numpy==1.24.1

# install spacy dependencies
COPY ./embedding/Makefile ./
RUN make -C . spacy

COPY ./embedding/src/ src/
COPY ./text_preprocessing src/text_preprocessing

Expand Down
5 changes: 4 additions & 1 deletion embedding/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!make

all: .venv text_preprocessing ./.venv/nltk_data models
all: .venv text_preprocessing ./.venv/nltk_data models spacy

.venv: requirements.txt
python -m venv .venv
Expand All @@ -16,6 +16,9 @@ text_preprocessing: ../text_preprocessing
models: ./src/resources/models/Makefile
@$(MAKE) -C ./src/resources/models

spacy:
source .venv/bin/activate; python -m spacy download en_core_web_sm

start:
source .venv/bin/activate; python start.py

Expand Down
2 changes: 2 additions & 0 deletions embedding/src/TimerHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from src.ProcessingResource import ProcessingResource
import os
import threading
import traceback

process_lock = threading.Lock() # Lock to prevent multiple calculations and restart during calculation
# Interval to query task queue (in seconds)
Expand Down Expand Up @@ -47,6 +48,7 @@ def run(self):
# Query task queue after timeout again
is_killed = self._kill.wait(self._interval)
except Exception as e:
traceback.print_exc()
self.__logger.error("Exception while processing: {}".format(str(e)))
process_lock.release()
# Query task queue after timeout again
Expand Down
8 changes: 5 additions & 3 deletions embedding/src/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import spacy
# Download the model so that importing it will work
# see https://stackoverflow.com/a/47297686/4306257
spacy.cli.download("en_core_web_sm")
if not spacy.util.is_package("en_core_web_sm"):
# Download the model so that importing it will work
# see https://stackoverflow.com/a/47297686/4306257
print("Downloading the spaCy English model (not installed yet)...")
spacy.cli.download("en_core_web_sm")

# Patch import
from .patch.patch_spacy_tags import TAG_MAP
Expand Down
2 changes: 2 additions & 0 deletions segmentation/src/TimerHandler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from src.ProcessingResource import ProcessingResource
import os
import threading
import traceback

process_lock = threading.Lock() # Lock to prevent multiple calculations and restart during calculation
# Interval to query task queue (in seconds)
Expand Down Expand Up @@ -48,6 +49,7 @@ def run(self):
# Query task queue after timeout again
is_killed = self._kill.wait(self._interval)
except Exception as e:
traceback.print_exc()
self.__logger.error("Exception while processing: {}".format(str(e)))
process_lock.release()
# Query task queue after timeout again
Expand Down

0 comments on commit 2dbf9c8

Please sign in to comment.