diff --git a/docker/PythonDockerfile b/docker/PythonDockerfile
index 21cfff6f..f87c8780 100644
--- a/docker/PythonDockerfile
+++ b/docker/PythonDockerfile
@@ -24,8 +24,7 @@ RUN pip install -U --no-cache-dir \
 
 
 #chinese reading pinyin
-RUN python3 -m spacy download ja_core_news_sm \
-    && python3 -m spacy download ko_core_news_sm \
+RUN python3 -m spacy download ko_core_news_sm \
     && python3 -m spacy download de_core_news_sm \
     && python3 -m spacy download nb_core_news_sm \
     && python3 -m spacy download es_core_news_sm \
diff --git a/tools/django_server/tokenizer/views.py b/tools/django_server/tokenizer/views.py
index fc25944f..8b8c6f6a 100755
--- a/tools/django_server/tokenizer/views.py
+++ b/tools/django_server/tokenizer/views.py
@@ -6,7 +6,7 @@ import spacy
 
 import time
 import re
-import ebooklib 
+import ebooklib
 import html
 import pinyin
 from ebooklib import epub
@@ -16,7 +16,7 @@ from urllib import parse
 
 
 @Language.component("custom_sentence_splitter")
-def custom_sentence_splitter(doc): 
+def custom_sentence_splitter(doc):
     punctuations = ['NEWLINE', '?', '!', '。', '?', '!', '.', '»', '«']
     for token in doc[:-1]:
         if token.text in punctuations:
@@ -30,7 +30,7 @@ def custom_sentence_splitter(doc):
 multi_nlp = spacy.load("xx_ent_wiki_sm", disable = ['ner'])
 multi_nlp.add_pipe("custom_sentence_splitter", first=True)
 
-japanese_nlp = spacy.load("ja_core_news_sm", disable = ['ner', 'parser'])
+japanese_nlp = spacy.load("/var/www/html/storage/app/model/ja_core_news_sm-3.7.0", disable = ['ner', 'parser'])
 japanese_nlp.add_pipe("custom_sentence_splitter", first=True)
 
 hiraganaConverter = pykakasi.kakasi()
@@ -96,7 +96,7 @@ def tokenizer(request):
         tokenizedText.append(tokenizeText(text, language))
     return HttpResponse(json.dumps(tokenizedText), content_type="application/json")
 
-# Cuts a text into sentences and words. Works like 
+# Cuts a text into sentences and words. Works like
 # tokenizer, but provides no additional data for words.
 def tokenizeTextSimple(words, language):
     tokenizedWords = list()
@@ -107,7 +107,7 @@ def tokenizeTextSimple(words, language):
         words = words.replace(sentenceEnding, sentenceEnding + 'TMP_ST')
 
     sentences = words.split('TMP_ST')
-    
+
     wordIndex = 0
     for sentenceIndex, sentence in enumerate(sentences):
         # split sentences into words
@@ -119,10 +119,10 @@ def tokenizeTextSimple(words, language):
         for word in sentences[sentenceIndex]:
             if word == ' ' or word == '' or word == ' ':
                 continue
-            
+
             tokenizedWords.append({'w': word, 'r': '', 'l': '', 'lr': '', 'pos': '','si': sentenceIndex, 'g': ''})
             wordIndex = wordIndex + 1
-    
+
     return tokenizedWords
 
 # Tokenizes a text with spacy.
@@ -130,7 +130,7 @@ def tokenizeText(words, language):
     tokenizedWords = list()
     if language == 'german':
         doc = german_nlp(words)
-    
+
     if language == 'japanese':
         doc = japanese_nlp(words)
 
@@ -145,16 +145,16 @@ def tokenizeText(words, language):
 
     if language == 'chinese':
         doc = chinese_nlp(words)
-    
+
     if language == 'dutch':
         doc = dutch_nlp(words)
-    
+
     if language == 'finnish':
         doc = finnish_nlp(words)
-    
+
     if language == 'french':
         doc = french_nlp(words)
-    
+
     if language == 'italian':
         doc = italian_nlp(words)
 
@@ -175,10 +175,10 @@ def tokenizeText(words, language):
         word = str(token.text)
         if word == ' ' or word == '' or word == ' ':
             continue
-        
+
         #get lemma
         lemma = token.lemma_
-        
+
         #get hiragana reading
         reading = list()
         lemmaReading = list()
@@ -186,11 +186,11 @@ def tokenizeText(words, language):
             result = hiraganaConverter.convert(token.text)
             for x in result:
                 reading.append(x['hira'])
-            
+
             result = hiraganaConverter.convert(token.lemma_)
             for x in result:
                 lemmaReading.append(x['hira'])
-            
+
             reading = ''.join(reading)
             lemmaReading = ''.join(lemmaReading)
 
@@ -206,7 +206,7 @@ def tokenizeText(words, language):
 
         if language == 'german' and token.pos_ == 'VERB':
             lemma = get_separable_lemma(token)
-        
+
         tokenizedWords.append({'w': word, 'r': reading, 'l': lemma, 'lr': lemmaReading, 'pos': token.pos_,'si': sentenceIndex, 'g': gender})
 
     return tokenizedWords
@@ -219,8 +219,8 @@ def get_separable_lemma(token):
 
 # loads n .epub file
 def loadBook(file):
-    htmlPattern = re.compile('<.*?>') 
-    
+    htmlPattern = re.compile('<.*?>')
+
     book = epub.read_epub(file)
 
     content = ''
@@ -235,11 +235,11 @@ def loadBook(file):
 
     return str(content)
 
-# returns a raw text and a tokenized text 
+# returns a raw text and a tokenized text
 # of n .epub file cut into chunks
 def importBook(request):
     postData = json.loads(request.body)
-    
+
     # load book
     content = loadBook(postData['importFile'])
     content = content.replace('\r\n', ' NEWLINE ')
@@ -276,7 +276,7 @@ def importBook(request):
 # cuts the text given in post data into chunks, and tokenizes them
 def importText(request):
     postData = json.loads(request.body)
-    
+
     # load text
     text = postData['importText']
     text = text.replace('\r\n', ' NEWLINE ')
@@ -312,23 +312,23 @@ def getYoutubeSubtitles(request):
 
 
     postData = json.loads(request.body)
-    
+
     parsedUrl = parse.urlparse(postData['url'])
     videoId = parse.parse_qs(parsedUrl.query)['v'][0]
 
     try:
         subtitles = YouTubeTranscriptApi.list_transcripts(videoId)
-    except TranscriptsDisabled: 
+    except TranscriptsDisabled:
         return HttpResponse(json.dumps(list()), content_type="application/json")
 
     subtitleList = list()
 
     for subtitle in subtitles:
         subtitleList.append({
-            'language': subtitle.language, 
-            'languageLowerCase': subtitle.language.lower(), 
-            'languageCode': subtitle.language_code, 
+            'language': subtitle.language,
+            'languageLowerCase': subtitle.language.lower(),
+            'languageCode': subtitle.language_code,
             'text': '\n'.join(line['text'] for line in subtitle.fetch())
         })
-    
-    return HttpResponse(json.dumps(subtitleList), content_type="application/json")
\ No newline at end of file
+
+    return HttpResponse(json.dumps(subtitleList), content_type="application/json")
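Note on the substantive change above: the Docker image no longer downloads ja_core_news_sm at build time, and views.py now passes spacy.load() a filesystem path to an unpacked model directory under /var/www/html/storage/app/model/ instead of an installed package name; spacy.load() accepts either form. The snippet below is only a minimal sketch of that loading pattern. The MODEL_DIR constant, the load_japanese_pipeline() helper, and the fallback to the pip-installed package are illustrative assumptions, not part of this patch.

# Minimal sketch (not part of this patch): load the Japanese spaCy pipeline
# from a local model directory, with an assumed fallback to the installed package.
from pathlib import Path

import spacy

# hypothetical constant mirroring the path used in the patch
MODEL_DIR = Path("/var/www/html/storage/app/model/ja_core_news_sm-3.7.0")


def load_japanese_pipeline():
    # spacy.load() accepts a path to an unpacked model directory as well as a package name
    if MODEL_DIR.is_dir():
        return spacy.load(MODEL_DIR, disable=['ner', 'parser'])
    # assumed fallback: use the pip-installed model if the local copy is missing
    return spacy.load("ja_core_news_sm", disable=['ner', 'parser'])


japanese_nlp = load_japanese_pipeline()

Presumably the point of the change is to keep the large Japanese model out of the Python image and pin it to a specific version (3.7.0) on shared storage, so it can be replaced without rebuilding the container.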