feat: make models be downloaded at runtime instead of buildtime
This commit updates the way models are loaded: they are now searched for on a
mounted volume instead of being loaded as an installed Python module, and their
installation is therefore removed from the respective Dockerfile.
sergiolaverde0 committed Jan 27, 2024
1 parent b155c45 commit 47698a9
Showing 2 changed files with 30 additions and 31 deletions.
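
The substance of the change: spacy.load() accepts either the name of an installed model package or a path to a model data directory, so views.py can point at model data on a mounted volume instead of requiring the package to be baked into the image. A minimal sketch of the idea, using the volume path from this commit (the load_model helper and its fallback are illustrative, not part of the change):

import spacy

MODEL_DIR = "/var/www/html/storage/app/model"  # mounted volume path used in this commit

def load_model(name, version, **kwargs):
    # spacy.load() takes either an installed package name or a path to a
    # model data directory; prefer the directory on the mounted volume and
    # fall back to an installed package (hypothetical fallback, not in the commit).
    path = f"{MODEL_DIR}/{name}-{version}"
    try:
        return spacy.load(path, **kwargs)
    except OSError:
        return spacy.load(name, **kwargs)

# Mirrors the updated call in views.py:
japanese_nlp = load_model("ja_core_news_sm", "3.7.0", disable=["ner", "parser"])
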
3 changes: 1 addition & 2 deletions docker/PythonDockerfile
@@ -24,8 +24,7 @@ RUN pip install -U --no-cache-dir \
#chinese reading
pinyin

-RUN python3 -m spacy download ja_core_news_sm \
-&& python3 -m spacy download ko_core_news_sm \
+RUN python3 -m spacy download ko_core_news_sm \
&& python3 -m spacy download de_core_news_sm \
&& python3 -m spacy download nb_core_news_sm \
&& python3 -m spacy download es_core_news_sm \
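
Because the ja_core_news_sm download is gone from the Dockerfile, the model data has to be present on the mounted volume before the Django server loads views.py. The commit itself does not show how the data gets there; one possible way to stage it at runtime, assuming spaCy's usual release URL and wheel layout (both assumptions, not part of this change), is sketched below:

import io
import urllib.request
import zipfile
from pathlib import Path

MODEL_DIR = Path("/var/www/html/storage/app/model")  # mounted volume path from the diff
# Assumed release URL; spaCy models are normally published as wheels on GitHub.
WHEEL_URL = (
    "https://github.com/explosion/spacy-models/releases/download/"
    "ja_core_news_sm-3.7.0/ja_core_news_sm-3.7.0-py3-none-any.whl"
)

def stage_japanese_model():
    # Skip the download when the data directory is already on the volume.
    if (MODEL_DIR / "ja_core_news_sm-3.7.0").exists():
        return
    wheel = zipfile.ZipFile(io.BytesIO(urllib.request.urlopen(WHEEL_URL).read()))
    prefix = "ja_core_news_sm/"  # wheels nest the data dir inside the package dir
    for member in wheel.namelist():
        if not member.startswith(prefix + "ja_core_news_sm-3.7.0/"):
            continue
        target = MODEL_DIR / member[len(prefix):]
        target.parent.mkdir(parents=True, exist_ok=True)
        if not member.endswith("/"):
            target.write_bytes(wheel.read(member))
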
58 changes: 29 additions & 29 deletions tools/django_server/tokenizer/views.py
@@ -6,7 +6,7 @@
import spacy
import time
import re
import ebooklib
import ebooklib
import html
import pinyin
from ebooklib import epub
@@ -16,7 +16,7 @@
from urllib import parse

@Language.component("custom_sentence_splitter")
def custom_sentence_splitter(doc):
def custom_sentence_splitter(doc):
punctuations = ['NEWLINE', '?', '!', '。', '?', '!', '.', '»', '«']
for token in doc[:-1]:
if token.text in punctuations:
@@ -30,7 +30,7 @@ def custom_sentence_splitter(doc):
multi_nlp = spacy.load("xx_ent_wiki_sm", disable = ['ner'])
multi_nlp.add_pipe("custom_sentence_splitter", first=True)

japanese_nlp = spacy.load("ja_core_news_sm", disable = ['ner', 'parser'])
japanese_nlp = spacy.load("/var/www/html/storage/app/model/ja_core_news_sm-3.7.0", disable = ['ner', 'parser'])
japanese_nlp.add_pipe("custom_sentence_splitter", first=True)
hiraganaConverter = pykakasi.kakasi()

@@ -96,7 +96,7 @@ def tokenizer(request):
tokenizedText.append(tokenizeText(text, language))
return HttpResponse(json.dumps(tokenizedText), content_type="application/json")

# Cuts a text into sentences and words. Works like
# Cuts a text into sentences and words. Works like
# tokenizer, but provides no additional data for words.
def tokenizeTextSimple(words, language):
tokenizedWords = list()
@@ -107,7 +107,7 @@ def tokenizeTextSimple(words, language):
words = words.replace(sentenceEnding, sentenceEnding + 'TMP_ST')

sentences = words.split('TMP_ST')

wordIndex = 0
for sentenceIndex, sentence in enumerate(sentences):
# split sentences into words
@@ -119,18 +119,18 @@
for word in sentences[sentenceIndex]:
if word == ' ' or word == '' or word == ' ':
continue

tokenizedWords.append({'w': word, 'r': '', 'l': '', 'lr': '', 'pos': '','si': sentenceIndex, 'g': ''})
wordIndex = wordIndex + 1

return tokenizedWords

# Tokenizes a text with spacy.
def tokenizeText(words, language):
tokenizedWords = list()
if language == 'german':
doc = german_nlp(words)

if language == 'japanese':
doc = japanese_nlp(words)

@@ -145,16 +145,16 @@

if language == 'chinese':
doc = chinese_nlp(words)

if language == 'dutch':
doc = dutch_nlp(words)

if language == 'finnish':
doc = finnish_nlp(words)

if language == 'french':
doc = french_nlp(words)

if language == 'italian':
doc = italian_nlp(words)

@@ -175,22 +175,22 @@
word = str(token.text)
if word == ' ' or word == '' or word == ' ':
continue

#get lemma
lemma = token.lemma_

#get hiragana reading
reading = list()
lemmaReading = list()
if language == 'japanese':
result = hiraganaConverter.convert(token.text)
for x in result:
reading.append(x['hira'])

result = hiraganaConverter.convert(token.lemma_)
for x in result:
lemmaReading.append(x['hira'])

reading = ''.join(reading)
lemmaReading = ''.join(lemmaReading)

@@ -206,7 +206,7 @@

if language == 'german' and token.pos_ == 'VERB':
lemma = get_separable_lemma(token)

tokenizedWords.append({'w': word, 'r': reading, 'l': lemma, 'lr': lemmaReading, 'pos': token.pos_,'si': sentenceIndex, 'g': gender})
return tokenizedWords

@@ -219,8 +219,8 @@ def get_separable_lemma(token):

# loads n .epub file
def loadBook(file):
htmlPattern = re.compile('<.*?>')
htmlPattern = re.compile('<.*?>')

book = epub.read_epub(file)
content = ''

@@ -235,11 +235,11 @@

return str(content)

# returns a raw text and a tokenized text
# returns a raw text and a tokenized text
# of n .epub file cut into chunks
def importBook(request):
postData = json.loads(request.body)

# load book
content = loadBook(postData['importFile'])
content = content.replace('\r\n', ' NEWLINE ')
@@ -276,7 +276,7 @@ def importBook(request):
# cuts the text given in post data into chunks, and tokenizes them
def importText(request):
postData = json.loads(request.body)

# load text
text = postData['importText']
text = text.replace('\r\n', ' NEWLINE ')
@@ -312,23 +312,23 @@ def importText(request):

def getYoutubeSubtitles(request):
postData = json.loads(request.body)

parsedUrl = parse.urlparse(postData['url'])
videoId = parse.parse_qs(parsedUrl.query)['v'][0]

try:
subtitles = YouTubeTranscriptApi.list_transcripts(videoId)
except TranscriptsDisabled:
except TranscriptsDisabled:
return HttpResponse(json.dumps(list()), content_type="application/json")

subtitleList = list()
for subtitle in subtitles:
subtitleList.append({
'language': subtitle.language,
'languageLowerCase': subtitle.language.lower(),
'languageCode': subtitle.language_code,
'language': subtitle.language,
'languageLowerCase': subtitle.language.lower(),
'languageCode': subtitle.language_code,
'text': '\n'.join(line['text'] for line in subtitle.fetch())
})

return HttpResponse(json.dumps(subtitleList), content_type="application/json")

return HttpResponse(json.dumps(subtitleList), content_type="application/json")
