diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml deleted file mode 100644 index a193b19..0000000 --- a/.github/workflows/integration-test.yml +++ /dev/null @@ -1,140 +0,0 @@ -# SPDX-FileCopyrightText: Nextcloud contributors -# SPDX-License-Identifier: AGPL-3.0-or-later - -name: Integration test - -on: - pull_request: - push: - branches: - - main - - stable* - -env: - APP_NAME: translate2 - -concurrency: - group: integration-test-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - - -jobs: - transcription: - runs-on: ubuntu-latest - - strategy: - # do not stop on another job's failure - fail-fast: false - matrix: - php-versions: [ '8.1' ] - databases: [ 'sqlite' ] - server-versions: [ 'master' ] - - name: Integration test on ${{ matrix.server-versions }} php@${{ matrix.php-versions }} - - env: - MYSQL_PORT: 4444 - PGSQL_PORT: 4445 - - services: - mysql: - image: mariadb:10.5 - ports: - - 4444:3306/tcp - env: - MYSQL_ROOT_PASSWORD: rootpassword - options: --health-cmd="mysqladmin ping" --health-interval 5s --health-timeout 2s --health-retries 5 - postgres: - image: postgres - ports: - - 4445:5432/tcp - env: - POSTGRES_USER: root - POSTGRES_PASSWORD: rootpassword - POSTGRES_DB: nextcloud - options: --health-cmd pg_isready --health-interval 5s --health-timeout 2s --health-retries 5 - - steps: - - name: Checkout server - uses: actions/checkout@v4 - with: - repository: nextcloud/server - ref: ${{ matrix.server-versions }} - - - name: Checkout submodules - shell: bash - run: | - auth_header="$(git config --local --get http.https://github.com/.extraheader)" - git submodule sync --recursive - git -c "http.extraheader=$auth_header" -c protocol.version=2 submodule update --init --force --recursive --depth=1 - - - name: Set up php ${{ matrix.php-versions }} - uses: shivammathur/setup-php@v2 - with: - php-version: ${{ matrix.php-versions }} - tools: phpunit - extensions: mbstring, iconv, fileinfo, intl, sqlite, pdo_mysql, pdo_sqlite, pgsql, pdo_pgsql, gd, zip - - - name: Checkout app - uses: actions/checkout@v4 - with: - path: ${{ env.APP_NAME }} - - - name: Checkout AppAPI - uses: actions/checkout@v4 - with: - repository: cloud-py-api/app_api - path: apps/app_api - - - name: Set up Nextcloud - if: ${{ matrix.databases != 'pgsql'}} - run: | - sleep 25 - mkdir data - ./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$MYSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password - php -S localhost:8080 & - - - name: Set up Nextcloud - if: ${{ matrix.databases == 'pgsql'}} - run: | - sleep 25 - mkdir data - ./occ maintenance:install --verbose --database=${{ matrix.databases }} --database-name=nextcloud --database-host=127.0.0.1 --database-port=$PGSQL_PORT --database-user=root --database-pass=rootpassword --admin-user admin --admin-pass password - php -S localhost:8080 & - - - name: Enable app and app_api - run: ./occ app:enable -vvv -f app_api - - - name: Setup python 3.11 - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Install and init backend - env: - PYTHONUNBUFFERED: 1 - APP_HOST: 0.0.0.0 - APP_ID: translate2 - APP_PORT: 9081 - APP_SECRET: 12345 - APP_VERSION: 1.0.0 - NEXTCLOUD_URL: http://localhost:8080 - working-directory: ${{ env.APP_NAME }} - run: | - pip install -r requirements.txt - make download-a-few-models - python3 lib/main.py & - - - name: Register backend - run: | - ./occ app_api:daemon:register --net host manual_install "Manual Install" manual-install http localhost http://localhost:8080 - ./occ app_api:app:register translate2 manual_install --json-info "{\"appid\":\"translate2\",\"name\":\"Local Machine Translation\",\"daemon_config_name\":\"manual_install\",\"version\":\"1.0.0\",\"secret\":\"12345\",\"port\":9081,\"scopes\":[\"AI_PROVIDERS\"],\"system_app\":0}" --force-scopes --wait-finish - - - name: Scan files - run: | - curl --header "Content-Type: application/json" -X POST http://localhost:8080/ocs/v2.php/translation/translate --data '{"text":"Hallo Welt","fromLanguage":"de","toLanguage":"en"}' - - - name: Show log on failure - if: always() - run: | - tail data/nextcloud.log \ No newline at end of file diff --git a/.gitignore b/.gitignore index ce93ed1..5f54590 100644 --- a/.gitignore +++ b/.gitignore @@ -93,3 +93,4 @@ MANIFEST converted/ geckodriver.log +models/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..cb19835 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,27 @@ +ci: + skip: [pyright] + +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-yaml + - id: check-toml + - id: mixed-line-ending + - id: trailing-whitespace + files: lib + - id: end-of-file-fixer + + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.5 + hooks: + - id: ruff + + - repo: local + hooks: + - id: pyright + name: pyright + entry: pyright + language: system + types: [python] + pass_filenames: false diff --git a/Dockerfile b/Dockerfile index 994e7ee..5338591 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,22 +1,38 @@ -FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04 +FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 -RUN \ - apt update && \ - apt install -y python3 python3-pip +ENV DEBIAN_FRONTEND noninteractive -COPY requirements.txt / +RUN apt-get update && \ + apt-get install -y software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y --no-install-recommends python3.11 python3.11-venv python3-pip vim git && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \ + apt-get -y clean && \ + rm -rf /var/lib/apt/lists/* -ADD cs[s] /app/css -ADD im[g] /app/img -ADD j[s] /app/js -ADD l10[n] /app/l10n -ADD li[b] /app/lib -ADD model[s] /app/models +# Set working directory +WORKDIR /app + +# Copy requirements files +COPY requirements.txt . + +# Install requirements +RUN python3 -m pip install --no-cache-dir --no-deps -r requirements.txt -RUN \ - python3 -m pip install -r requirements.txt && rm -rf ~/.cache && rm requirements.txt +ENV NVIDIA_VISIBLE_DEVICES all +ENV NVIDIA_DRIVER_CAPABILITIES compute +ENV DEBIAN_FRONTEND dialog + +# Copy application files +ADD cs[s] /app/css +ADD im[g] /app/img +ADD j[s] /app/js +ADD l10[n] /app/l10n +ADD li[b] /app/lib +ADD config.json /app/config.json +ADD languages.json /app/languages.json -WORKDIR /app/lib -ENTRYPOINT ["python3", "main.py"] +ENTRYPOINT ["python3", "lib/main.py"] LABEL org.opencontainers.image.source="https://github.com/nextcloud/translate2" diff --git a/README.md b/README.md index ce9fc2d..a8b6c47 100644 --- a/README.md +++ b/README.md @@ -1 +1 @@ -# Nextcloud Local Machine Translation \ No newline at end of file +# Nextcloud Local Machine Translation diff --git a/config.json b/config.json new file mode 100644 index 0000000..899f9c1 --- /dev/null +++ b/config.json @@ -0,0 +1,19 @@ +{ + "__comment::log_level": "Log level for the app, see https://docs.python.org/3/library/logging.html#logging-levels", + "__comment::tokenizer_file": "The tokenizer file name inside the model directory (loader.model_path)", + "__comment::loader": "CTranslate2 loader options, see https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html#ctranslate2.Translator.__init__. Use 'model_path' key for local paths or 'model_name' key for models hosted on Hugging Face. Both can't be used at the same time.", + "__comment::inference": "CTranslate2 inference options, see the kwargs in https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html#ctranslate2.Translator.translate_batch", + "__comment::changes_to_the_config": "the program needs to be restarted if you change this file since it is stored in memory on startup", + "log_level": 20, + "tokenizer_file": "spiece.model", + "loader": { + "model_name": "Nextcloud-AI/madlad400-3b-mt-ct2-int8_float32", + "inter_threads": 4, + "intra_threads": 0 + }, + "inference": { + "max_batch_size": 8192, + "sampling_temperature": 0.0001, + "disable_unk": true + } +} diff --git a/languages.json b/languages.json new file mode 100644 index 0000000..1f834a2 --- /dev/null +++ b/languages.json @@ -0,0 +1,420 @@ +{ + "en": "English", + "ru": "Russian", + "es": "Spanish", + "fr": "French", + "de": "German", + "it": "Italian", + "pt": "Portuguese", + "pl": "Polish", + "nl": "Dutch", + "vi": "Vietnamese", + "tr": "Turkish", + "sv": "Swedish", + "id": "Indonesian", + "ro": "Romanian", + "cs": "Czech", + "zh": "Mandarin Chinese", + "hu": "Hungarian", + "ja": "Japanese", + "th": "Thai", + "fi": "Finnish", + "fa": "Persian", + "uk": "Ukrainian", + "da": "Danish", + "el": "Greek", + "no": "Norwegian", + "bg": "Bulgarian", + "sk": "Slovak", + "ko": "Korean", + "ar": "Arabic", + "lt": "Lithuanian", + "ca": "Catalan", + "sl": "Slovenian", + "he": "Hebrew", + "et": "Estonian", + "lv": "Latvian", + "hi": "Hindi", + "sq": "Albanian", + "ms": "Malay", + "az": "Azerbaijani", + "sr": "Serbian", + "ta": "Tamil", + "hr": "Croatian", + "kk": "Kazakh", + "is": "Icelandic", + "ml": "Malayalam", + "mr": "Marathi", + "te": "Telugu", + "af": "Afrikaans", + "gl": "Galician", + "fil": "Filipino", + "be": "Belarusian", + "mk": "Macedonian", + "eu": "Basque", + "bn": "Bengali", + "ka": "Georgian", + "mn": "Mongolian", + "bs": "Bosnian", + "uz": "Uzbek", + "ur": "Urdu", + "sw": "Swahili", + "yue": "Cantonese", + "ne": "Nepali", + "kn": "Kannada", + "kaa": "Kara-Kalpak", + "gu": "Gujarati", + "si": "Sinhala", + "cy": "Welsh", + "eo": "Esperanto", + "la": "Latin", + "hy": "Armenian", + "ky": "Kyrghyz", + "tg": "Tajik", + "ga": "Irish", + "mt": "Maltese", + "my": "Myanmar (Burmese)", + "km": "Khmer", + "tt": "Tatar", + "so": "Somali", + "ku": "Kurdish (Kurmanji)", + "ps": "Pashto", + "pa": "Punjabi", + "rw": "Kinyarwanda", + "lo": "Lao", + "ha": "Hausa", + "dv": "Dhivehi", + "fy": "W. Frisian", + "lb": "Luxembourgish", + "ckb": "Kurdish (Sorani)", + "mg": "Malagasy", + "gd": "Scottish Gaelic", + "am": "Amharic", + "ug": "Uyghur", + "ht": "Haitian Creole", + "grc": "Ancient Greek", + "hmn": "Hmong", + "sd": "Sindhi", + "jv": "Javanese", + "mi": "Maori", + "tk": "Turkmen", + "ceb": "Cebuano", + "yi": "Yiddish", + "ba": "Bashkir", + "fo": "Faroese", + "or": "Odia (Oriya)", + "xh": "Xhosa", + "su": "Sundanese", + "kl": "Kalaallisut", + "ny": "Chichewa", + "sm": "Samoan", + "sn": "Shona", + "co": "Corsican", + "zu": "Zulu", + "ig": "Igbo", + "yo": "Yoruba", + "pap": "Papiamento", + "st": "Sesotho", + "haw": "Hawaiian", + "as": "Assamese", + "oc": "Occitan", + "cv": "Chuvash", + "lus": "Mizo", + "tet": "Tetum", + "gsw": "Swiss German", + "sah": "Yakut", + "br": "Breton", + "rm": "Romansh", + "sa": "Sanskrit", + "bo": "Tibetan", + "om": "Oromo", + "se": "N. Sami", + "ce": "Chechen", + "cnh": "Hakha Chin", + "ilo": "Ilocano", + "hil": "Hiligaynon", + "udm": "Udmurt", + "os": "Ossetian", + "lg": "Luganda", + "ti": "Tigrinya", + "vec": "Venetian", + "ts": "Tsonga", + "tyv": "Tuvinian", + "kbd": "Kabardian", + "ee": "Ewe", + "iba": "Iban", + "av": "Avar", + "kha": "Khasi", + "to": "Tonga (Tonga Islands)", + "tn": "Tswana", + "nso": "Sepedi", + "fj": "Fijian", + "zza": "Zaza", + "ak": "Twi", + "ada": "Adangme", + "otq": "Querétaro Otomi", + "dz": "Dzongkha", + "bua": "Buryat", + "cfm": "Falam Chin", + "ln": "Lingala", + "chm": "Meadow Mari", + "gn": "Guarani", + "krc": "Karachay-Balkar", + "wa": "Walloon", + "hif": "Fiji Hindi", + "yua": "Yucateco", + "srn": "Sranan Tongo", + "war": "Waray (Philippines)", + "rom": "Romani", + "bik": "Central Bikol", + "pam": "Pampanga", + "sg": "Sango", + "lu": "Luba-Katanga", + "ady": "Adyghe", + "kbp": "Kabiyè", + "syr": "Syriac", + "ltg": "Latgalian", + "myv": "Erzya", + "iso": "Isoko", + "kac": "Kachin", + "bho": "Bhojpuri", + "ay": "Aymara", + "kum": "Kumyk", + "qu": "Quechua", + "za": "Zhuang", + "pag": "Pangasinan", + "ngu": "Guerrero Nahuatl", + "ve": "Venda", + "pck": "Paite Chin", + "zap": "Zapotec", + "tyz": "Tày", + "hui": "Huli", + "bbc": "Batak Toba", + "tzo": "Tzotzil", + "tiv": "Tiv", + "ksd": "Kuanua", + "gom": "Goan Konkani", + "min": "Minangkabau", + "ang": "Old English", + "nhe": "E. Huasteca Nahuatl", + "bgp": "E. Baluchi", + "nzi": "Nzima", + "nnb": "Nande", + "nv": "Navajo", + "zxx": "Noise", + "bci": "Baoulé", + "kv": "Komi", + "new": "Newari", + "mps": "Dadibi", + "alt": "S. Altai", + "meu": "Motu", + "bew": "Betawi", + "fon": "Fon", + "iu": "Inuktitut", + "abt": "Ambulas", + "mgh": "Makhuwa-Meetto", + "mnw": "Mon", + "tvl": "Tuvalu", + "dov": "Dombe", + "tlh": "Klingon", + "ho": "Hiri Motu", + "kw": "Cornish", + "mrj": "Hill Mari", + "meo": "Kedah Malay", + "crh": "Crimean Tatar", + "mbt": "Matigsalug Manobo", + "emp": "N. Emberá", + "ace": "Achinese", + "ium": "Iu Mien", + "mam": "Mam", + "gym": "Ngäbere", + "mai": "Maithili", + "crs": "Seselwa Creole French", + "pon": "Pohnpeian", + "ubu": "Umbu-Ungu", + "fip": "Fipa", + "quc": "K’iche’", + "gv": "Manx", + "kj": "Kuanyama", + "btx": "Batak Karo", + "ape": "Bukiyip", + "chk": "Chuukese", + "rcf": "Réunion Creole French", + "shn": "Shan", + "tzh": "Tzeltal", + "mdf": "Moksha", + "ppk": "Uma", + "ss": "Swati", + "gag": "Gagauz", + "cab": "Garifuna", + "kri": "Krio", + "seh": "Sena", + "ibb": "Ibibio", + "tbz": "Ditammari", + "bru": "E. Bru", + "enq": "Enga", + "ach": "Acoli", + "cuk": "San Blas Kuna", + "kmb": "Kimbundu", + "wo": "Wolof", + "kek": "Kekchí", + "qub": "Huallaga Huánuco Quechua", + "tab": "Tabassaran", + "bts": "Batak Simalungun", + "kos": "Kosraean", + "rwo": "Rawa", + "cak": "Kaqchikel", + "tuc": "Mutu", + "bum": "Bulu", + "cjk": "Chokwe", + "gil": "Gilbertese", + "stq": "Saterfriesisch", + "tsg": "Tausug", + "quh": "S. Bolivian Quechua", + "mak": "Makasar", + "arn": "Mapudungun", + "ban": "Balinese", + "jiv": "Shuar", + "sja": "Epena", + "yap": "Yapese", + "tcy": "Tulu", + "toj": "Tojolabal", + "twu": "Termanu", + "xal": "Kalmyk", + "amu": "Guerrero Amuzgo", + "rmc": "Carpathian Romani", + "hus": "Huastec", + "nia": "Nias", + "kjh": "Khakas", + "bm": "Bambara", + "guh": "Guahibo", + "mas": "Masai", + "acf": "St Lucian Creole French", + "dtp": "Kadazan Dusun", + "ksw": "S’gaw Karen", + "bzj": "Belize Kriol English", + "din": "Dinka", + "zne": "Zande", + "mad": "Madurese", + "msi": "Sabah Malay", + "mag": "Magahi", + "mkn": "Kupang Malay", + "kg": "Kongo", + "lhu": "Lahu", + "ch": "Chamorro", + "qvi": "Imbabura H. Quichua", + "mh": "Marshallese", + "djk": "E. Maroon Creole", + "sus": "Susu", + "mfe": "Morisien", + "srm": "Saramaccan", + "dyu": "Dyula", + "ctu": "Chol", + "gui": "E. Bolivian Guaraní", + "pau": "Palauan", + "inb": "Inga", + "bi": "Bislama", + "mni": "Meiteilon (Manipuri)", + "guc": "Wayuu", + "jam": "Jamaican Creole English", + "wal": "Wolaytta", + "jac": "Popti’", + "bas": "Basa (Cameroon)", + "gor": "Gorontalo", + "skr": "Saraiki", + "nyu": "Nyungwe", + "noa": "Woun Meu", + "sda": "Toraja-Sa’dan", + "gub": "Guajajára", + "nog": "Nogai", + "cni": "Asháninka", + "teo": "Teso", + "tdx": "Tandroy-Mahafaly Malagasy", + "sxn": "Sangir", + "rki": "Rakhine", + "nr": "South Ndebele", + "frp": "Arpitan", + "alz": "Alur", + "taj": "E. Tamang", + "lrc": "N. Luri", + "cce": "Chopi", + "rn": "Rundi", + "jvn": "Caribbean Javanese", + "hvn": "Sabu", + "nij": "Ngaju", + "dwr": "Dawro", + "izz": "Izii", + "msm": "Agusan Manobo", + "bus": "Bokobaru", + "ktu": "Kituba (DRC)", + "chr": "Cherokee", + "maz": "Central Mazahua", + "tzj": "Tz’utujil", + "suz": "Sunwar", + "knj": "W. Kanjobal", + "bim": "Bimoba", + "gvl": "Gulay", + "bqc": "Boko (Benin)", + "tca": "Ticuna", + "pis": "Pijin", + "prk": "Parauk", + "laj": "Lango (Uganda)", + "mel": "Central Melanau", + "qxr": "Cañar H. Quichua", + "niq": "Nandi", + "ahk": "Akha", + "shp": "Shipibo-Conibo", + "hne": "Chhattisgarhi", + "spp": "Supyire Senoufo", + "koi": "Komi-Permyak", + "krj": "Kinaray-A", + "quf": "Lambayeque Quechua", + "luz": "S. Luri", + "agr": "Aguaruna", + "tsc": "Tswa", + "mqy": "Manggarai", + "gof": "Gofa", + "gbm": "Garhwali", + "miq": "Mískito", + "dje": "Zarma", + "awa": "Awadhi", + "bjj": "Kanauji", + "qvz": "N. Pastaza Quichua", + "sjp": "Surjapuri", + "tll": "Tetela", + "raj": "Rajasthani", + "kjg": "Khmu", + "bgz": "Banggai", + "quy": "Ayacucho Quechua", + "cbk": "Chavacano", + "akb": "Batak Angkola", + "oj": "Ojibwa", + "ify": "Keley-I Kallahan", + "mey": "Hassaniyya", + "ks": "Kashmiri", + "cac": "Chuj", + "brx": "Bodo (India)", + "qup": "S. Pastaza Quechua", + "syl": "Sylheti", + "jax": "Jambi Malay", + "ff": "Fulfulde", + "ber": "Tamazight (Tfng)", + "tks": "Takestani", + "trp": "Kok Borok", + "mrw": "Maranao", + "adh": "Adhola", + "smt": "Simte", + "srr": "Serer", + "ffm": "Maasina Fulfulde", + "qvc": "Cajamarca Quechua", + "mtr": "Mewari", + "ann": "Obolo", + "kaa-Latn": "Kara-Kalpak (Latn)", + "aa": "Afar", + "noe": "Nimadi", + "nut": "Nung (Viet Nam)", + "gyn": "Guyanese Creole English", + "kwi": "Awa-Cuaiquer", + "xmm": "Manado Malay", + "msb": "Masbatenyo" +} diff --git a/lib/Service.py b/lib/Service.py index e17fb6d..67ddcfe 100644 --- a/lib/Service.py +++ b/lib/Service.py @@ -1,59 +1,97 @@ +"""Translation service""" + +import json +import logging import os +from contextlib import contextmanager +from copy import deepcopy from time import perf_counter -from transformers import pipeline + +import ctranslate2 +from sentencepiece import SentencePieceProcessor +from util import clean_text + +GPU_ACCELERATED = os.getenv("COMPUTE_DEVICE", "cuda") != "cpu" + +logger = logging.getLogger(__name__) + +if os.getenv("CI") is not None: + ctranslate2.set_random_seed(420) + + +@contextmanager +def translate_context(config: dict): + try: + tokenizer = SentencePieceProcessor() + tokenizer.Load(os.path.join(config["loader"]["model_path"], config["tokenizer_file"])) + + translator = ctranslate2.Translator( + **{ + "device": "cuda" if GPU_ACCELERATED else "cpu", + **config["loader"], + } + ) + except KeyError as e: + raise Exception("Incorrect config file") from e + except Exception as e: + raise Exception("Error loading the translation model") from e + + try: + start = perf_counter() + yield (tokenizer, translator) + elapsed = perf_counter() - start + logger.info(f"time taken: {elapsed:.2f}s") + except Exception as e: + raise Exception("Error translating the input text") from e + finally: + del tokenizer + # todo: offload to cpu? + del translator + class Service: - dir_path = os.path.dirname(os.path.realpath(__file__)) + def __init__(self, config: dict): + global logger + try: + self.load_config(config) + ctranslate2.set_log_level(config["log_level"]) + logger.setLevel(config["log_level"]) + + with open("languages.json") as f: + self.languages = json.loads(f.read()) + except Exception as e: + raise Exception( + "Error reading languages list, ensure languages.json is present in the project root" + ) from e def get_lang_names(self): - return { - 'de': 'German', - 'en': 'English', - 'es': 'Spanish', - 'fr': 'French', - 'zh': 'Chinese', - 'it': 'Italian', - 'sv': 'Swedish', - 'ar': 'Arabic', - 'fi': 'Finnish', - 'nl': 'Dutch', - 'ja': 'Japanese', - 'tr': 'Turkish', - } - - def get_models(self): - models = [] - - for file in os.scandir(self.dir_path + "/../models/"): - if os.path.isdir(file.path): - models.append(file.name) - - return models - - def get_langs(self): - lang_names = self.get_lang_names() - from_languages = {} - to_languages = {} - for model_name in self.get_models(): - [from_language, to_language] = model_name.split('-', 2) - from_languages[from_language] = lang_names[from_language] - to_languages[to_language] = lang_names[to_language] - return from_languages, to_languages - - def translate(self, from_language, to_language, text): - model_name = from_language + "-" + to_language - print(f"model: {model_name}") - - if not model_name in self.get_models(): - if 'en-'+to_language in self.get_models() and from_language+'-en' in self.get_models(): - return self.translate('en', to_language, self.translate(from_language, 'en', text)) - - raise Exception('Requested model is not available') - - translator = pipeline("translation", model=self.dir_path + "/../models/" + model_name) - print("translating") - start = perf_counter() - translation = translator(text) - print(f"time taken {perf_counter() - start}") - print(translation) - return translation[0]['translation_text'] + return self.languages + + def load_config(self, config: dict): + config_copy = deepcopy(config) + config_copy["loader"].pop("model_name", None) + + if "hf_model_path" in config_copy["loader"]: + config_copy["loader"]["model_path"] = config_copy["loader"].pop("hf_model_path") + + self.config = config_copy + + def translate(self, to_language: str, text: str) -> str: + logger.debug(f"translating text to: {to_language}") + + with translate_context(self.config) as (tokenizer, translator): + input_tokens = tokenizer.Encode(f"<2{to_language}> {clean_text(text)}", out_type=str) + results = translator.translate_batch( + [input_tokens], + batch_type="tokens", + **self.config["inference"], + ) + + if len(results) == 0 or len(results[0].hypotheses) == 0: + raise Exception("Empty result returned from translator") + + # todo: handle multiple hypotheses + translation = tokenizer.Decode(results[0].hypotheses[0]) + + logger.info(f"Translated string: {translation}") + return translation diff --git a/lib/main.py b/lib/main.py index d5b81fd..3f92512 100644 --- a/lib/main.py +++ b/lib/main.py @@ -1,25 +1,56 @@ -"""Tha main module of the translate2 app -""" +"""The main module of the translate2 app""" +import logging +import os import queue import threading import typing from contextlib import asynccontextmanager -from fastapi import Depends, FastAPI, responses, Body +import uvicorn.logging +from dotenv import load_dotenv +from fastapi import Body, FastAPI, Request, responses from nc_py_api import AsyncNextcloudApp, NextcloudApp -from nc_py_api.ex_app import LogLvl, anc_app, run_app, set_handlers -import torch +from nc_py_api.ex_app import LogLvl, run_app, set_handlers from Service import Service +from util import load_config_file, save_config_file + +load_dotenv() + +config = load_config_file() + +# logging config +logging.basicConfig() +logger = logging.getLogger(__name__) +logger.setLevel(config["log_level"]) + + +class ModelConfig(dict): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def __setitem__(self, key, value): + if key == "path": + config["loader"]["hf_model_path"] = value + service.load_config(config) + save_config_file(config) + + super().__setitem__(key, value) + + +# download models if "model_name" key is present in the config +models_to_fetch = None +cache_dir = os.getenv("APP_PERSISTENT_STORAGE", "models/") +if "model_name" in config["loader"]: + models_to_fetch = { config["loader"]["model_name"]: ModelConfig({ "cache_dir": cache_dir }) } -cuda = torch.cuda.is_available() -service = Service() @asynccontextmanager -async def lifespan(_app: FastAPI): +async def lifespan(_: FastAPI): set_handlers( - APP, - enabled_handler, + fast_api_app=APP, + enabled_handler=enabled_handler, # type: ignore + models_to_fetch=models_to_fetch, # type: ignore ) t = BackgroundProcessTask() t.start() @@ -28,6 +59,23 @@ async def lifespan(_app: FastAPI): APP = FastAPI(lifespan=lifespan) TASK_LIST: queue.Queue = queue.Queue(maxsize=100) +service = Service(config) + + +@APP.exception_handler(Exception) +async def _(request: Request, exc: Exception): + logger.error("Error processing request", request.url.path, exc) + + task: dict | None = getattr(exc, "args", None) + + nc = NextcloudApp() + nc.log(LogLvl.ERROR, str(exc)) + if task: + nc.providers.translations.report_result(task["id"], error=str(exc)) + + return responses.JSONResponse({ + "error": "An error occurred while processing the request, please check the logs for more info" + }, 500) class BackgroundProcessTask(threading.Thread): @@ -35,30 +83,32 @@ def run(self, *args, **kwargs): # pylint: disable=unused-argument while True: task = TASK_LIST.get(block=True) try: - translation = service.translate(task.get("from_language"), task.get("to_language"), task.get("text")) + translation = service.translate(task["to_language"], task["text"]) NextcloudApp().providers.translations.report_result( task_id=task["id"], - result=str(translation), + result=str(translation).strip(), ) except Exception as e: # noqa - print(str(e)) - nc = NextcloudApp() - nc.log(LogLvl.ERROR, str(e)) - nc.providers.translations.report_result(task["id"], error=str(e)) - + e.args = task + raise e @APP.post("/translate") async def tiny_llama( - _nc: typing.Annotated[AsyncNextcloudApp, Depends(anc_app)], from_language: typing.Annotated[str, Body()], to_language: typing.Annotated[str, Body()], text: typing.Annotated[str, Body()], task_id: typing.Annotated[int, Body()], ): try: - print({"text": text, "from_language": from_language, "to_language": to_language, "id": task_id}) - TASK_LIST.put({"text": text, "from_language": from_language, "to_language": to_language, "id": task_id}, block=False) + task = { + "text": text, + "from_language": from_language, + "to_language": to_language, + "id": task_id, + } + logger.debug(task) + TASK_LIST.put(task) except queue.Full: return responses.JSONResponse(content={"error": "task queue is full"}, status_code=429) return responses.Response() @@ -67,16 +117,25 @@ async def tiny_llama( async def enabled_handler(enabled: bool, nc: AsyncNextcloudApp) -> str: print(f"enabled={enabled}") if enabled is True: - from_languages, to_languages = service.get_langs() - print(to_languages) - print(from_languages) - await nc.providers.translations.register('translate2', "Local Machine translation", '/translate', from_languages, to_languages) + languages = service.get_lang_names() + logger.info( + "Supported languages short list", { + "count": len(languages), + "languages": list(languages.keys())[:10], + } + ) + await nc.providers.translations.register( + "translate2", + "Local Machine Translation", + "/translate", + languages, + languages, + ) else: - await nc.providers.speech_to_text.unregister('translate2') + await nc.providers.speech_to_text.unregister("translate2") return "" - - if __name__ == "__main__": - run_app("main:APP", log_level="trace") + uvicorn_log_level = uvicorn.logging.TRACE_LOG_LEVEL if config["log_level"] == logging.DEBUG else config["log_level"] + run_app("main:APP", log_level=uvicorn_log_level) diff --git a/lib/util.py b/lib/util.py new file mode 100644 index 0000000..02be859 --- /dev/null +++ b/lib/util.py @@ -0,0 +1,23 @@ +"""Utility functions""" + +import json +import re + + +def clean_text(text: str) -> str: + return re.sub(r"(\r?\n)+", " ", text).strip() + + +def load_config_file(path: str = "config.json") -> dict: + with open(path) as f: + config = json.loads(f.read()) + if "model_name" in config["loader"] and "model_path" in config["loader"]: + raise Exception("Both 'model_name' and 'model_path' keys are present in the config. Please remove one of them.") # noqa: E501 + if "model_name" not in config["loader"] and "model_path" not in config["loader"]: + raise Exception("Neither 'model_name' nor 'model_path' keys are present in the config. Please add one of them.") # noqa: E501 + return config + + +def save_config_file(config: dict, path: str = "config.json") -> None: + with open(path, "w") as f: + f.write(json.dumps(config, indent=4)) diff --git a/pyproject.toml b/pyproject.toml index 7e1f410..fe7f01f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ preview = true line-length = 120 target-version = "py310" select = ["A", "B", "C", "D", "E", "F", "G", "I", "S", "SIM", "PIE", "Q", "RET", "RUF", "UP" , "W"] -extend-ignore = ["D101", "D102", "D103", "D105", "D107", "D203", "D213", "D401", "I001", "RUF100", "D400", "D415"] +extend-ignore = ["D101", "D102", "D103", "D105", "D107", "D203", "D213", "D401", "I001", "RUF100", "D400", "D415", "G004"] [tool.isort] profile = "black" diff --git a/requirements.in.txt b/requirements.in.txt new file mode 100644 index 0000000..a99069d --- /dev/null +++ b/requirements.in.txt @@ -0,0 +1,5 @@ +fastapi +ctranslate2 +huggingface_hub +nc_py_api[app]>=0.15.0 +sentencepiece diff --git a/requirements.txt b/requirements.txt index 8135952..a7b8d6a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,48 @@ -nc_py_api[app]>=0.8.0 -torch -transformers -sentencepiece \ No newline at end of file +annotated-types==0.7.0 +anyio==4.4.0 +certifi==2024.7.4 +charset-normalizer==3.3.2 +click==8.1.7 +ctranslate2==4.3.1 +dnspython==2.6.1 +email_validator==2.2.0 +fastapi==0.111.0 +fastapi-cli==0.0.4 +filelock==3.15.4 +fsspec==2024.6.1 +h11==0.14.0 +httpcore==1.0.5 +httptools==0.6.1 +httpx==0.27.0 +huggingface-hub==0.23.4 +idna==3.7 +Jinja2==3.1.4 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +mdurl==0.1.2 +nc-py-api==0.15.1 +numpy==2.0.0 +orjson==3.10.6 +packaging==24.1 +pydantic==2.8.2 +pydantic_core==2.20.1 +Pygments==2.18.0 +python-dotenv==1.0.1 +python-multipart==0.0.9 +PyYAML==6.0.1 +requests==2.32.3 +rich==13.7.1 +sentencepiece==0.2.0 +shellingham==1.5.4 +sniffio==1.3.1 +starlette==0.37.2 +tqdm==4.66.4 +typer==0.12.3 +typing_extensions==4.12.2 +ujson==5.10.0 +urllib3==2.2.2 +uvicorn==0.30.1 +uvloop==0.19.0 +watchfiles==0.22.0 +websockets==12.0 +xmltodict==0.13.0