Skip to content

Commit

Permalink
Merge pull request #337 from sergiolaverde0/turkish
Browse files Browse the repository at this point in the history
Temporary fix for Turkish
  • Loading branch information
simjanos-dev authored Aug 26, 2024
2 parents 0cbe743 + 000e974 commit b03f6d7
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 5 deletions.
3 changes: 2 additions & 1 deletion docker/PythonDockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ USER laravel
RUN pip install -U --no-cache-dir \
setuptools \
wheel \
lxml[html_clean] \
lxml \
lxml_html_clean \
#youtube api
youtube_transcript_api \
#ebook library
Expand Down
21 changes: 17 additions & 4 deletions tools/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from urllib import parse
from pysubparser import parser
from pysubparser.cleaners import formatting
import lxml.html.clean
import lxml_html_clean
import lxml.html
import importlib
import shutil
Expand Down Expand Up @@ -375,7 +375,7 @@ def tokenizeText(text, language, sentenceIndexStart = 0):
# loads an .epub file
def loadBook(file, sortMethod):
# rp and rt tags are used for adding pronunciation over words; we need to remove the content of those tags
cleaner = lxml.html.clean.Cleaner(allow_tags=[''], remove_unknown_tags=False, kill_tags = ['rp','rt'], page_structure=False)
cleaner = lxml_html_clean.Cleaner(allow_tags=[''], remove_unknown_tags=False, kill_tags = ['rp','rt'], page_structure=False)
content = ''
book = epub.read_epub(file)
items = list(book.get_items())
Expand All @@ -390,7 +390,12 @@ def loadBook(file, sortMethod):

for item in sortedItems:
if item.get_type() == ebooklib.ITEM_DOCUMENT:
epubPage = cleaner.clean_html(item.get_content()).decode('utf-8')
# clean_html cannot be passed bytes, but it also cannot be passed a str
# that still contains an explicit XML encoding declaration. So we decode
# to a string and then use a regex to strip the encoding declaration.
content_str = item.get_content().decode()
content_str = re.sub(r'<\?xml[^>]+\?>', '', content_str, count=1)
epubPage = cleaner.clean_html(content_str)
# needed to remove the extra div created by the cleaner...
epubPage = lxml.html.fromstring(epubPage).text_content()
content += epubPage
Expand Down Expand Up @@ -620,7 +625,7 @@ def getWebsiteText():
"Russian": "https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl",
"Ukrainian": "https://github.com/explosion/spacy-models/releases/download/uk_core_news_sm-3.7.0/uk_core_news_sm-3.7.0-py3-none-any.whl",
"Chinese": "https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl",
"Turkish": "https://huggingface.co/turkish-nlp-suite/tr_core_news_md/resolve/main/tr_core_news_md-any-py3-none-any.whl",
"Turkish": "https://huggingface.co/turkish-nlp-suite/tr_core_news_md/resolve/main/tr_core_news_md-1.0-py3-none-any.whl",
"Thai": "spacy_thai",
}

Expand Down Expand Up @@ -656,6 +661,14 @@ def model_install():
"install",
"--target=/var/www/html/storage/app/model",
"tzdata"])
# https://stackoverflow.com/questions/78634235
if lang == "Turkish":
subprocess.check_output([
"pip",
"install",
"--target=/var/www/html/storage/app/model",
"numpy<2.0.0",
"--upgrade"])
importlib.invalidate_caches()
return HTTPResponse(status=200, body="Language and dependencies installed correctly")
except subprocess.CalledProcessError as e:
Expand Down

0 comments on commit b03f6d7

Please sign in to comment.