feat: make models be downloaded at runtime instead of buildtime
This commit updates the way models are loaded: they are now searched for on a
mounted volume instead of being loaded as an installed Python module, and their
installation is therefore removed from the respective Dockerfile.
sergiolaverde0 committed Jan 27, 2024
1 parent b155c45 commit 47698a9
Showing 2 changed files with 30 additions and 31 deletions.
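
The substance of the change: spacy.load() accepts either the name of an installed model package or a path to a model data directory, so views.py can point at model data on a mounted volume instead of requiring the package to be baked into the image. A minimal sketch of the idea, using the volume path from this commit (the load_model helper and its fallback are illustrative, not part of the change):

import spacy

MODEL_DIR = "/var/www/html/storage/app/model"  # mounted volume path used in this commit

def load_model(name, version, **kwargs):
    # spacy.load() takes either an installed package name or a path to a
    # model data directory; prefer the directory on the mounted volume and
    # fall back to an installed package (hypothetical fallback, not in the commit).
    path = f"{MODEL_DIR}/{name}-{version}"
    try:
        return spacy.load(path, **kwargs)
    except OSError:
        return spacy.load(name, **kwargs)

# Mirrors the updated call in views.py:
japanese_nlp = load_model("ja_core_news_sm", "3.7.0", disable=["ner", "parser"])
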
3 changes: 1 addition & 2 deletions docker/PythonDockerfile
@@ -24,8 +24,7 @@ RUN pip install -U --no-cache-dir \
#chinese reading
pinyin

-RUN python3 -m spacy download ja_core_news_sm \
-&& python3 -m spacy download ko_core_news_sm \
+RUN python3 -m spacy download ko_core_news_sm \
&& python3 -m spacy download de_core_news_sm \
&& python3 -m spacy download nb_core_news_sm \
&& python3 -m spacy download es_core_news_sm \
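
Because the ja_core_news_sm download is gone from the Dockerfile, the model data has to be present on the mounted volume before the Django server loads views.py. The commit itself does not show how the data gets there; one possible way to stage it at runtime, assuming spaCy's usual release URL and wheel layout (both assumptions, not part of this change), is sketched below:

import io
import urllib.request
import zipfile
from pathlib import Path

MODEL_DIR = Path("/var/www/html/storage/app/model")  # mounted volume path from the diff
# Assumed release URL; spaCy models are normally published as wheels on GitHub.
WHEEL_URL = (
    "https://github.com/explosion/spacy-models/releases/download/"
    "ja_core_news_sm-3.7.0/ja_core_news_sm-3.7.0-py3-none-any.whl"
)

def stage_japanese_model():
    # Skip the download when the data directory is already on the volume.
    if (MODEL_DIR / "ja_core_news_sm-3.7.0").exists():
        return
    wheel = zipfile.ZipFile(io.BytesIO(urllib.request.urlopen(WHEEL_URL).read()))
    prefix = "ja_core_news_sm/"  # wheels nest the data dir inside the package dir
    for member in wheel.namelist():
        if not member.startswith(prefix + "ja_core_news_sm-3.7.0/"):
            continue
        target = MODEL_DIR / member[len(prefix):]
        target.parent.mkdir(parents=True, exist_ok=True)
        if not member.endswith("/"):
            target.write_bytes(wheel.read(member))
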
58 changes: 29 additions & 29 deletions tools/django_server/tokenizer/views.py
@@ -6,7 +6,7 @@
import spacy
import time
import re
import ebooklib
import ebooklib
import html
import pinyin
from ebooklib import epub
@@ -16,7 +16,7 @@
from urllib import parse

@Language.component("custom_sentence_splitter")
def custom_sentence_splitter(doc):
def custom_sentence_splitter(doc):
punctuations = ['NEWLINE', '?', '!', '。', '?', '!', '.', '»', '«']
for token in doc[:-1]:
if token.text in punctuations:
@@ -30,7 +30,7 @@ def custom_sentence_splitter(doc):
multi_nlp = spacy.load("xx_ent_wiki_sm", disable = ['ner'])
multi_nlp.add_pipe("custom_sentence_splitter", first=True)

japanese_nlp = spacy.load("ja_core_news_sm", disable = ['ner', 'parser'])
japanese_nlp = spacy.load("/var/www/html/storage/app/model/ja_core_news_sm-3.7.0", disable = ['ner', 'parser'])
japanese_nlp.add_pipe("custom_sentence_splitter", first=True)
hiraganaConverter = pykakasi.kakasi()

@@ -96,7 +96,7 @@ def tokenizer(request):
tokenizedText.append(tokenizeText(text, language))
return HttpResponse(json.dumps(tokenizedText), content_type="application/json")

# Cuts a text into sentences and words. Works like
# Cuts a text into sentences and words. Works like
# tokenizer, but provides no additional data for words.
def tokenizeTextSimple(words, language):
tokenizedWords = list()
@@ -107,7 +107,7 @@ def tokenizeTextSimple(words, language):
words = words.replace(sentenceEnding, sentenceEnding + 'TMP_ST')

sentences = words.split('TMP_ST')

wordIndex = 0
for sentenceIndex, sentence in enumerate(sentences):
# split sentences into words
@@ -119,18 +119,18 @@
for word in sentences[sentenceIndex]:
if word == ' ' or word == '' or word == ' ':
continue

tokenizedWords.append({'w': word, 'r': '', 'l': '', 'lr': '', 'pos': '','si': sentenceIndex, 'g': ''})
wordIndex = wordIndex + 1

return tokenizedWords

# Tokenizes a text with spacy.
def tokenizeText(words, language):
tokenizedWords = list()
if language == 'german':
doc = german_nlp(words)

if language == 'japanese':
doc = japanese_nlp(words)

@@ -145,16 +145,16 @@

if language == 'chinese':
doc = chinese_nlp(words)

if language == 'dutch':
doc = dutch_nlp(words)

if language == 'finnish':
doc = finnish_nlp(words)

if language == 'french':
doc = french_nlp(words)

if language == 'italian':
doc = italian_nlp(words)

@@ -175,22 +175,22 @@
word = str(token.text)
if word == ' ' or word == '' or word == ' ':
continue

#get lemma
lemma = token.lemma_

#get hiragana reading
reading = list()
lemmaReading = list()
if language == 'japanese':
result = hiraganaConverter.convert(token.text)
for x in result:
reading.append(x['hira'])

result = hiraganaConverter.convert(token.lemma_)
for x in result:
lemmaReading.append(x['hira'])

reading = ''.join(reading)
lemmaReading = ''.join(lemmaReading)

@@ -206,7 +206,7 @@

if language == 'german' and token.pos_ == 'VERB':
lemma = get_separable_lemma(token)

tokenizedWords.append({'w': word, 'r': reading, 'l': lemma, 'lr': lemmaReading, 'pos': token.pos_,'si': sentenceIndex, 'g': gender})
return tokenizedWords

@@ -219,8 +219,8 @@ def get_separable_lemma(token):

# loads n .epub file
def loadBook(file):
htmlPattern = re.compile('<.*?>')
htmlPattern = re.compile('<.*?>')

book = epub.read_epub(file)
content = ''

@@ -235,11 +235,11 @@

return str(content)

# returns a raw text and a tokenized text
# returns a raw text and a tokenized text
# of n .epub file cut into chunks
def importBook(request):
postData = json.loads(request.body)

# load book
content = loadBook(postData['importFile'])
content = content.replace('\r\n', ' NEWLINE ')
@@ -276,7 +276,7 @@ def importBook(request):
# cuts the text given in post data into chunks, and tokenizes them
def importText(request):
postData = json.loads(request.body)

# load text
text = postData['importText']
text = text.replace('\r\n', ' NEWLINE ')
@@ -312,23 +312,23 @@ def importText(request):

def getYoutubeSubtitles(request):
postData = json.loads(request.body)

parsedUrl = parse.urlparse(postData['url'])
videoId = parse.parse_qs(parsedUrl.query)['v'][0]

try:
subtitles = YouTubeTranscriptApi.list_transcripts(videoId)
except TranscriptsDisabled:
except TranscriptsDisabled:
return HttpResponse(json.dumps(list()), content_type="application/json")

subtitleList = list()
for subtitle in subtitles:
subtitleList.append({
'language': subtitle.language,
'languageLowerCase': subtitle.language.lower(),
'languageCode': subtitle.language_code,
'language': subtitle.language,
'languageLowerCase': subtitle.language.lower(),
'languageCode': subtitle.language_code,
'text': '\n'.join(line['text'] for line in subtitle.fetch())
})

return HttpResponse(json.dumps(subtitleList), content_type="application/json")

return HttpResponse(json.dumps(subtitleList), content_type="application/json")
