Skip to content

Commit

Permalink
Add SMT Job
Browse files Browse the repository at this point in the history
* Add unigram truecaser
* Add CPU only docker image
* Add Latin default tokenizer
* Add vim to docker image for rebasing
* Add SMT integration test
* Update CI packages
  • Loading branch information
johnml1135 committed May 20, 2024
1 parent f8f3fc5 commit 9d7c432
Show file tree
Hide file tree
Showing 64 changed files with 11,047 additions and 32 deletions.
3 changes: 2 additions & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
"AWS_ACCESS_KEY_ID": "${localEnv:AWS_ACCESS_KEY_ID}",
"AWS_SECRET_ACCESS_KEY": "${localEnv:AWS_SECRET_ACCESS_KEY}",
"CLEARML_API_ACCESS_KEY": "${localEnv:CLEARML_API_ACCESS_KEY}",
"CLEARML_API_SECRET_KEY": "${localEnv:CLEARML_API_SECRET_KEY}"
"CLEARML_API_SECRET_KEY": "${localEnv:CLEARML_API_SECRET_KEY}",
"ENV_FOR_DYNACONF": "development"
},
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
Expand Down
2 changes: 1 addition & 1 deletion .devcontainer/dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ RUN apt-get update && \
apt-get install --no-install-recommends -y \
python$PYTHON_VERSION \
python$PYTHON_VERSION-distutils \
git curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \
git vim curl gdb ca-certificates gnupg2 tar make gcc libssl-dev zlib1g-dev libncurses5-dev \
libbz2-dev libreadline-dev libreadline6-dev libxml2-dev xz-utils libgdbm-dev libgdbm-compat-dev tk-dev dirmngr \
libxmlsec1-dev libsqlite3-dev libffi-dev liblzma-dev lzma lzma-dev uuid-dev && \
rm -rf /var/lib/apt/lists/*
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ jobs:
- name: Lint with isort
run: poetry run isort . --check-only
- name: Setup Node for pyright
uses: actions/setup-node@v3
uses: actions/setup-node@v4
with:
node-version: "12"
node-version: "14"
- name: Lint with pyright
run: |
npm install -g [email protected].313
npm install -g [email protected].362
poetry run pyright
- name: Test with pytest
run: poetry run pytest --cov --cov-report=xml
Expand Down
16 changes: 14 additions & 2 deletions .github/workflows/docker-build-push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,21 @@ on:
tags:
- "docker_*"

env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}

jobs:
docker:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- dockerfile: ./dockerfile
image: ghcr.io/sillsdev/machine.py
- dockerfile: ./dockerfile.cpu_only
image: ghcr.io/sillsdev/machine.py.cpu_only
steps:
- name: Free Disk Space (Ubuntu)
uses: jlumbroso/free-disk-space@main
Expand All @@ -21,8 +33,7 @@ jobs:
id: meta
uses: docker/metadata-action@v4
with:
images: |
ghcr.io/${{ github.repository }}
images: ${{ matrix.image }}
tags: |
type=match,pattern=docker_(.*),group=1
flavor: |
Expand All @@ -39,6 +50,7 @@ jobs:
uses: docker/build-push-action@v4
with:
context: .
file: ${{ matrix.dockerfile }}
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
27 changes: 22 additions & 5 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
},
{
"name": "build_nmt_engine",
"type": "python",
"type": "debugpy",
"request": "launch",
"module": "machine.jobs.build_nmt_engine",
"justMyCode": false,
Expand Down Expand Up @@ -51,14 +51,31 @@
]
}
},
{
"name": "build_smt_engine",
"type": "debugpy",
"request": "launch",
"module": "machine.jobs.build_smt_engine",
"justMyCode": false,
"args": [
"--model-type",
"hmm",
"--build-id",
"build1",
"--save-model",
"myModelName"
]
},
{
"name": "Python: Debug Tests",
"type": "python",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"purpose": ["debug-test"],
"purpose": [
"debug-test"
],
"console": "integratedTerminal",
"justMyCode": false
}
]
}
}
5 changes: 3 additions & 2 deletions dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,10 @@ RUN ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python3 & \
ln -sfn /usr/bin/python${PYTHON_VERSION} /usr/bin/python

COPY --from=builder /src/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt && rm requirements.txt
RUN --mount=type=cache,target=/root/.cache \
pip install --no-cache-dir -r requirements.txt && rm requirements.txt

COPY . .
RUN pip install --no-deps . && rm -r *
RUN pip install --no-deps . && rm -r /root/*

CMD ["bash"]
36 changes: 36 additions & 0 deletions dockerfile.cpu_only
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Compatibility with TensorFlow 2.6.0 as per https://www.tensorflow.org/install/source#gpu
ARG PYTHON_VERSION=3.11
# NOTE(review): UBUNTU_VERSION is not referenced anywhere in this file; it is kept
# so that existing `--build-arg UBUNTU_VERSION=...` invocations are unaffected.
ARG UBUNTU_VERSION=focal
ARG POETRY_VERSION=1.6.1

# Stage 1 (builder): use Poetry to export the locked dependencies to requirements.txt.
FROM python:$PYTHON_VERSION-slim AS builder
# Re-declare the global ARG so that it is visible inside this build stage.
ARG POETRY_VERSION

ENV POETRY_HOME=/opt/poetry
ENV POETRY_VENV=/opt/poetry-venv
ENV POETRY_CACHE_DIR=/opt/.cache

# Install poetry separated from system interpreter
RUN python3 -m venv $POETRY_VENV \
    && $POETRY_VENV/bin/pip install -U pip setuptools \
    && $POETRY_VENV/bin/pip install poetry==${POETRY_VERSION}

# Add `poetry` to PATH
ENV PATH="${PATH}:${POETRY_VENV}/bin"

WORKDIR /src
# With more than one source file, the COPY destination must be a directory ending in "/".
COPY poetry.lock pyproject.toml /src/
# NOTE(review): this CPU-only image still exports the "gpu" dependency group - confirm this is intended.
RUN poetry export --with=gpu --without-hashes -f requirements.txt > requirements.txt


# Stage 2 (runtime): install the exported requirements and the package itself.
FROM python:$PYTHON_VERSION
WORKDIR /root

COPY --from=builder /src/requirements.txt .
# NOTE(review): the cache mount on /root/.cache is combined with `--no-cache-dir`,
# so pip presumably never populates it - confirm which of the two is wanted.
RUN --mount=type=cache,target=/root/.cache \
    pip install --no-cache-dir -r requirements.txt && rm requirements.txt

COPY . .
# Install the package without resolving dependencies (they were installed above),
# then delete the copied sources from the working directory.
RUN pip install --no-deps . && rm -r /root/*

CMD ["bash"]
16 changes: 14 additions & 2 deletions machine/corpora/token_processors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import unicodedata
from typing import Sequence
from typing import Literal, Sequence


def lowercase(tokens: Sequence[str]) -> Sequence[str]:
Expand All @@ -14,8 +14,20 @@ def unescape_spaces(tokens: Sequence[str]) -> Sequence[str]:
return [(" " if t == "<space>" else t) for t in tokens]


def _get_normalization_form(normalization_form: str) -> Literal["NFC", "NFD", "NFKC", "NFKD"]:
if normalization_form == "NFC":
return "NFC"
if normalization_form == "NFD":
return "NFD"
if normalization_form == "NFKC":
return "NFKC"
if normalization_form == "NFKD":
return "NFKD"
raise ValueError(f"Unknown normalization form: {normalization_form}")


def normalize(normalization_form: str, tokens: Sequence[str]) -> Sequence[str]:
return [unicodedata.normalize(normalization_form, t) for t in tokens]
return [unicodedata.normalize(_get_normalization_form(normalization_form), t) for t in tokens]


def nfc_normalize(tokens: Sequence[str]) -> Sequence[str]:
Expand Down
4 changes: 4 additions & 0 deletions machine/jobs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from .clearml_shared_file_service import ClearMLSharedFileService
from .local_shared_file_service import LocalSharedFileService
from .nmt_engine_build_job import NmtEngineBuildJob
from .nmt_model_factory import NmtModelFactory
from .shared_file_service import PretranslationInfo, PretranslationWriter, SharedFileService
from .smt_engine_build_job import SmtEngineBuildJob

__all__ = [
"ClearMLSharedFileService",
"LocalSharedFileService",
"NmtEngineBuildJob",
"NmtModelFactory",
"PretranslationInfo",
"PretranslationWriter",
"SharedFileService",
"SmtEngineBuildJob",
]
2 changes: 1 addition & 1 deletion machine/jobs/build_nmt_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
level=logging.INFO,
)

logger = logging.getLogger(__package__ + ".build_nmt_engine")
logger = logging.getLogger(str(__package__) + ".build_nmt_engine")


def run(args: dict) -> None:
Expand Down
73 changes: 73 additions & 0 deletions machine/jobs/build_smt_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import argparse
import logging
from typing import Callable, Optional

from clearml import Task

from ..utils.canceled_error import CanceledError
from ..utils.progress_status import ProgressStatus
from .clearml_shared_file_service import ClearMLSharedFileService
from .config import SETTINGS
from .smt_engine_build_job import SmtEngineBuildJob

# Setup logging
# Configures the root logger at import time: INFO level, with each record
# formatted as "<time> - <logger name> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Logger for this entry point. str() is applied because __package__ may be None
# (e.g. when the file is executed directly), and None cannot be concatenated
# with a string.
logger = logging.getLogger(str(__package__) + ".build_smt_engine")


def run(args: dict) -> None:
    """Run an SMT engine build job, optionally tracked by a ClearML task.

    When ``args["clearml"]`` is truthy, a ClearML task is initialized and two
    callbacks are handed to the job: one that raises ``CanceledError`` once the
    task status is "stopped", and one that reports the completion percentage
    (rounded to 4 decimals) as the single value "progress".

    Args:
        args: Job arguments. Must contain the key "clearml"; the whole dict is
            merged into ``SETTINGS`` before the job is created.

    Raises:
        Exception: Any error raised while building is re-raised, unless a
            ClearML task exists and its status is "stopped", in which case the
            function returns silently. Otherwise the task is marked as failed
            before re-raising.
    """
    progress: Optional[Callable[[ProgressStatus], None]] = None
    check_canceled: Optional[Callable[[], None]] = None
    task = None

    if args["clearml"]:
        task = Task.init()

        def raise_if_stopped() -> None:
            if task.get_status() == "stopped":
                raise CanceledError

        def report_progress(status: ProgressStatus) -> None:
            if status.percent_completed is not None:
                task.get_logger().report_single_value(name="progress", value=round(status.percent_completed, 4))

        check_canceled = raise_if_stopped
        progress = report_progress

    try:
        logger.info("SMT Engine Build Job started")

        SETTINGS.update(args)
        file_service = ClearMLSharedFileService(SETTINGS)
        build_job = SmtEngineBuildJob(SETTINGS, file_service)
        build_job.run(progress=progress, check_canceled=check_canceled)
        logger.info("Finished")
    except Exception as e:
        if task:
            # A stopped task means the failure is the result of a cancellation.
            if task.get_status() == "stopped":
                return
            task.mark_failed(status_reason=type(e).__name__, status_message=str(e))
        raise e


def main() -> None:
    """Parse the command-line options and start the SMT build job.

    Options whose parsed value is ``None`` (i.e. ``--build-options`` when it is
    not supplied) are left out of the dict passed to ``run``.
    """
    arg_parser = argparse.ArgumentParser(description="Trains an SMT model.")

    # Required string options, registered in the order shown in --help.
    required_options = (
        ("--model-type", "Model type"),
        ("--build-id", "Build id"),
        ("--save-model", "Save the model using the specified base name"),
    )
    for flag, help_text in required_options:
        arg_parser.add_argument(flag, required=True, type=str, help=help_text)

    arg_parser.add_argument("--clearml", default=False, action="store_true", help="Initializes a ClearML task")
    arg_parser.add_argument("--build-options", default=None, type=str, help="Build configurations")

    parsed = vars(arg_parser.parse_args())
    supplied = {name: value for name, value in parsed.items() if value is not None}

    run(supplied)


if __name__ == "__main__":
main()
32 changes: 32 additions & 0 deletions machine/jobs/local_shared_file_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import logging
import shutil
from pathlib import Path

from .shared_file_service import SharedFileService

logger = logging.getLogger(__name__)


class LocalSharedFileService(SharedFileService):
    """Shared file service that reads and writes directly on the local file system.

    Every path handed to this class is resolved by ``_get_path`` relative to
    ``self._shared_file_uri`` (presumably set by the ``SharedFileService`` base
    class - confirm against its constructor).
    """

    def _download_file(self, path: str, cache: bool = False) -> Path:
        """Return the local location of ``path``; nothing is copied and ``cache`` is unused."""
        return self._get_path(path)

    def _download_folder(self, path: str, cache: bool = False) -> Path:
        """Return the local location of the folder ``path``; nothing is copied and ``cache`` is unused."""
        return self._get_path(path)

    def _exists_file(self, path: str) -> bool:
        """Return whether the resolved ``path`` exists on disk."""
        return self._get_path(path).exists()

    def _upload_file(self, path: str, local_file_path: Path) -> None:
        """Copy ``local_file_path`` to the resolved ``path``, creating parent directories as needed."""
        dst_path = self._get_path(path)
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(local_file_path, dst_path)

    def _upload_folder(self, path: str, local_folder_path: Path) -> None:
        """Recursively copy ``local_folder_path`` into the resolved ``path``.

        The destination directory is created if it is missing; if it already
        exists, the copied files are merged into it.
        """
        dst_path = self._get_path(path)
        dst_path.mkdir(parents=True, exist_ok=True)
        # shutil.copyfile only handles regular files and raises on a directory,
        # so the folder has to be copied with copytree. dirs_exist_ok is needed
        # because the destination directory was just created above.
        shutil.copytree(local_folder_path, dst_path, dirs_exist_ok=True)

    def _get_path(self, name: str) -> Path:
        """Build the local path for ``name`` under ``self._shared_file_uri``."""
        # Don't use shared file folder for local files
        return Path(f"{self._shared_file_uri}/{name}")
Loading

0 comments on commit 9d7c432

Please sign in to comment.