Skip to content

Commit

Permalink
Re-allow adding external directory to docker image (#695)
Browse files Browse the repository at this point in the history
* add mounting dir

* minor fix

* support abs and rel path

* add docs

* refactor to extra context

* minor fix docs

* minor fix

* modify docs
  • Loading branch information
blahBlahhhJ authored Aug 15, 2024
1 parent 441af5c commit ef6349c
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 5 deletions.
7 changes: 7 additions & 0 deletions docker/tpu/Dockerfile.incremental
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ ARG TAG=latest

FROM ${IMAGE}:${TAG}

# This usually is a config directory so users can have their own config directory outside the repo.
ARG EXTRA_CTX=/config

ENV TENSORSTORE_CURL_LOW_SPEED_TIME_SECONDS=60\
TENSORSTORE_CURL_LOW_SPEED_LIMIT_BYTES=1024\
RAY_USAGE_STATS_ENABLED=0\
Expand All @@ -17,3 +20,7 @@ RUN mkdir -p /opt/levanter/src
ADD pyproject.toml README.md /opt/levanter/
RUN pip install -e '.[test]'
ADD . /opt/levanter

# Copy the extra context to $EXTRA_CTX, mirroring its location on the local machine,
# so that the same (config) path(s) passed as train_lm.py arguments still work.
COPY .mnt $EXTRA_CTX
7 changes: 7 additions & 0 deletions docs/Getting-Started-TPU-VM.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,13 @@ To run in the foreground, use `--foreground` with the `launch.py` script. You sh
python infra/launch.py -- python src/levanter/main/train_lm.py --config_path config/gpt2_small.yaml --trainer.checkpointer.base_path gs://<somewhere>
```
### Using external directory/file
If you want to reference a directory or file outside of the levanter repo, you can add it to the docker image so that it becomes accessible on the TPU instances. Specify the path to add as extra build context with the `--extra_context` argument of the `launch.py` script. You can then use the external files in arguments to `train_lm.py` etc.
```bash
python infra/launch.py --extra_context <external path> -- python src/levanter/main/train_lm.py --config_path <external path> --trainer.checkpointer.base_path gs://<somewhere>
```

### Babysitting Script

If you are using a preemptible TPU VM, you probably want to use the "babysitting" script that automatically re-creates
Expand Down
5 changes: 5 additions & 0 deletions infra/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import subprocess
import time
from pathlib import Path

from infra import push_docker
from infra.helpers import cli
Expand Down Expand Up @@ -210,6 +211,7 @@ def _default_run_id():
cli.add_arg(parser, config, ["--docker_registry"], default="gcp", choices=["gcp", "ghcr"])
cli.add_arg(parser, config, ["--github_user"], type=str)
cli.add_arg(parser, config, ["--github_token"], type=str)
cli.add_arg(parser, config, ["--extra_context"], type=Path, default=Path("config"))

parser.add_argument(
"-e", "--env", action="append", nargs=2, metavar=("KEY", "VALUE"), default=list(config.get("env", {}).items())
Expand Down Expand Up @@ -239,6 +241,7 @@ def _default_run_id():
registry = args.docker_registry
github_user = args.github_user
github_token = args.github_token
extra_context = args.extra_context

region = "-".join(zone.split("-")[:-1])
env = {k: v for k, v in args.env}
Expand All @@ -259,6 +262,7 @@ def _default_run_id():
github_user=github_user,
github_token=github_token,
docker_file="docker/tpu/Dockerfile.incremental",
extra_context=extra_context,
)
elif registry == "gcp":
full_image_id = push_docker.push_to_gcp(
Expand All @@ -268,6 +272,7 @@ def _default_run_id():
image_name=image_id,
tag=tag,
docker_file="docker/tpu/Dockerfile.incremental",
extra_context=extra_context,
)
else:
raise ValueError(f"Unknown docker registry: {args.docker_registry}")
Expand Down
45 changes: 40 additions & 5 deletions infra/push_docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@

import argparse
import json
import os
import pty
import shutil
import subprocess
import sys
from pathlib import Path

from infra.helpers import cli

Expand All @@ -35,6 +38,27 @@
]


def _rm(path):
if path.is_dir():
shutil.rmtree(path, ignore_errors=True)
elif path.is_file():
os.remove(path)
elif path.exists():
raise RuntimeError(f"Remove failed. Path ({path}) is neither a directory nor a file.")


def _cp(src, dst):
    """Replace ``dst`` with a copy of ``src``.

    Whatever currently lives at ``dst`` is removed via ``_rm`` before the
    copy. A directory source is copied recursively; a regular-file source is
    copied as a single file.

    Args:
        src: ``pathlib.Path``-like source directory or file.
        dst: ``pathlib.Path``-like destination; removed first if present.

    Raises:
        RuntimeError: If ``src`` is neither a directory nor a regular file.
    """
    # Start from a clean slate so the copy never collides with a stale dst.
    _rm(dst)

    if src.is_dir():
        shutil.copytree(src, dst)
        return

    if src.is_file():
        shutil.copy(src, dst)
        return

    raise RuntimeError(f"Copy failed. Source path ({src}) is neither a directory nor a file. Check if it exists.")


def _run(argv):
if sys.stdout.isatty():
exit_code = pty.spawn(argv)
Expand Down Expand Up @@ -128,14 +152,22 @@ def configure_gcp_docker(project_id, region, repository):
_run(["gcloud", "auth", "configure-docker", "--quiet", f"{region}-docker.pkg.dev"])


def build_docker(docker_file, image_name, tag) -> str:
def build_docker(docker_file, image_name, tag, mount_src) -> str:
"""Builds a Docker image, enables artifact access, and pushes to Artifact Registry."""
# Copy external files temporarily to .mnt
mount_dst = Path(".mnt")
_cp(mount_src, mount_dst)

# Get mounting path in docker image.
levanter_path = Path("/opt/levanter")
extra_context = levanter_path / mount_src
_run(
[
"docker",
"buildx",
"build",
"--build-arg",
f"EXTRA_CTX={extra_context.resolve()}",
"--platform=linux/amd64",
"-t",
f"{image_name}:{tag}",
Expand All @@ -144,12 +176,14 @@ def build_docker(docker_file, image_name, tag) -> str:
".",
]
)
# clean up after building
_rm(mount_dst)

return f"{image_name}:{tag}"


# Disabled until we can figure out how Docker hub organizations work
def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None):
def push_to_github(local_image, tag, github_user=None, github_token=None, docker_file=None, extra_context=None):
"""Pushes a local Docker image to Docker Hub."""

# Authenticate the docker service with Github if a token exists
Expand All @@ -160,17 +194,17 @@ def push_to_github(local_image, tag, github_user=None, github_token=None, docker
print(login_process.communicate(input=github_token.encode(), timeout=10))

remote_name = f"ghcr.io/{github_user}/{local_image}:{tag}"
local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag)
local_name = build_docker(docker_file=docker_file, image_name=local_image, tag=tag, mount_src=extra_context)

_run(["docker", "tag", local_name, remote_name])
_run(["docker", "push", remote_name])
return remote_name


def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) -> str:
def push_to_gcp(project_id, region, repository, image_name, tag, docker_file, extra_context) -> str:
"""Pushes a local Docker image to Artifact Registry."""
configure_gcp_docker(project_id, region, repository)
local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag)
local_image = build_docker(docker_file=docker_file, image_name=image_name, tag=tag, mount_src=extra_context)

artifact_repo = f"{region}-docker.pkg.dev/{project_id}/{repository}"

Expand Down Expand Up @@ -214,4 +248,5 @@ def push_to_gcp(project_id, region, repository, image_name, tag, docker_file) ->
args.image,
args.tag,
docker_file=args.docker_file,
extra_context=Path("config"),
)

0 comments on commit ef6349c

Please sign in to comment.