
Commit

[Arch] Use hash to avoid repeat building EventStreamRuntime image (All-Hands-AI#3243)

* update the put-source-code step to copy files into the build context instead of a tarball

* add dirhash to the dependencies

* fix the Dockerfile COPY

* use dirhash to avoid repeated builds when updating the source (see the sketch after this list)

* fix the runtime_build test case

* add dir_hash to the docker build pipeline

* add additional tests for the source directory

* add a comment

* clarify the assertion by explicitly checking existing files

* also assert that od is a dir
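
In short, the prepared docker build folder (unpacked source plus the rendered Dockerfile) is hashed with dirhash, and that MD5 digest is used as an extra image tag, so the build is skipped whenever an image for the same folder contents already exists. A minimal sketch of the idea in Python, not the project's actual implementation; build_dir and repo are placeholder names:

import docker
from dirhash import dirhash

def build_if_needed(build_dir: str, repo: str) -> str:
    """Illustrative only: skip the docker build when the build folder is unchanged."""
    client = docker.from_env()
    hash_tag = f"{repo}:{dirhash(build_dir, 'md5')}"
    try:
        client.images.get(hash_tag)  # an image for these exact contents already exists
        return hash_tag
    except docker.errors.ImageNotFound:
        pass
    client.images.build(path=build_dir, tag=hash_tag, rm=True)
    return hash_tag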
xingyaoww authored Aug 5, 2024
1 parent abec52a commit a69120d
Showing 7 changed files with 284 additions and 93 deletions.
5 changes: 5 additions & 0 deletions containers/build.sh
@@ -53,6 +53,11 @@ fi
if [[ -n "$DOCKER_IMAGE_TAG" ]]; then
tags+=("$DOCKER_IMAGE_TAG")
fi
# If $DOCKER_IMAGE_HASH_TAG is set, add it to the tags
if [[ -n "$DOCKER_IMAGE_HASH_TAG" ]]; then
tags+=("$DOCKER_IMAGE_HASH_TAG")
fi


DOCKER_REPOSITORY="$DOCKER_REGISTRY/$DOCKER_ORG/$DOCKER_IMAGE"
DOCKER_REPOSITORY=${DOCKER_REPOSITORY,,} # lowercase
2 changes: 0 additions & 2 deletions containers/runtime/config.sh
@@ -4,5 +4,3 @@ DOCKER_BASE_DIR="./containers/runtime"
# These two variables will be appended by the runtime_build.py script
# DOCKER_IMAGE=
# DOCKER_IMAGE_TAG=
DOCKER_IMAGE=od_runtime
DOCKER_IMAGE_TAG=od_v0.8.1_image_ubuntu_tag_22.04
135 changes: 96 additions & 39 deletions opendevin/runtime/utils/runtime_build.py
@@ -6,6 +6,7 @@

import docker
import toml
from dirhash import dirhash
from jinja2 import Environment, FileSystemLoader

import opendevin
@@ -47,7 +48,8 @@ def _create_project_source_dist():
return tarball_path


def _put_source_code_to_dir(temp_dir: str) -> str:
def _put_source_code_to_dir(temp_dir: str):
"""Put the source code of OpenDevin to the temp_dir/code."""
tarball_path = _create_project_source_dist()
filename = os.path.basename(tarball_path)
filename = filename.removesuffix('.tar.gz')
@@ -59,12 +61,18 @@ def _put_source_code_to_dir(temp_dir: str) -> str:
logger.info(
f'Source distribution moved to {os.path.join(temp_dir, "project.tar.gz")}'
)
return filename

# unzip the tarball
shutil.unpack_archive(os.path.join(temp_dir, 'project.tar.gz'), temp_dir)
# remove the tarball
os.remove(os.path.join(temp_dir, 'project.tar.gz'))
# rename the directory to the 'code'
os.rename(os.path.join(temp_dir, filename), os.path.join(temp_dir, 'code'))
logger.info(f'Unpacked source code directory: {os.path.join(temp_dir, "code")}')


def _generate_dockerfile(
base_image: str,
source_code_dirname: str,
skip_init: bool = False,
extra_deps: str | None = None,
) -> str:
@@ -77,7 +85,6 @@ def _put_source_code_to_dir(temp_dir: str) -> str:
template = env.get_template('Dockerfile.j2')
dockerfile_content = template.render(
base_image=base_image,
source_code_dirname=source_code_dirname,
skip_init=skip_init,
extra_deps=extra_deps if extra_deps is not None else '',
)
@@ -89,12 +96,14 @@ def prep_docker_build_folder(
base_image: str,
skip_init: bool = False,
extra_deps: str | None = None,
):
"""Prepares the docker build folder by copying the source code and generating the Dockerfile."""
source_code_dirname = _put_source_code_to_dir(dir_path)
) -> str:
"""Prepares the docker build folder by copying the source code and generating the Dockerfile.
Return the MD5 hash of the directory.
"""
_put_source_code_to_dir(dir_path)
dockerfile_content = _generate_dockerfile(
base_image,
source_code_dirname,
skip_init=skip_init,
extra_deps=extra_deps,
)
@@ -108,14 +117,28 @@
with open(os.path.join(dir_path, 'Dockerfile'), 'w') as file:
file.write(dockerfile_content)

hash = dirhash(dir_path, 'md5')
logger.info(
f'Input base image: {base_image}\n'
f'Skip init: {skip_init}\n'
f'Extra deps: {extra_deps}\n'
f'Hash for docker build directory [{dir_path}] (contents: {os.listdir(dir_path)}): {hash}\n'
)
return hash
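
(For reference, a caller uses the returned digest as an image tag. A rough usage sketch, where the repository name is a placeholder and prep_docker_build_folder is the function above:)

import tempfile

with tempfile.TemporaryDirectory() as build_dir:
    digest = prep_docker_build_folder(build_dir, base_image='ubuntu:22.04')
    hash_tag = f'od_runtime:{digest}'  # same contents, same tag, so the build can be skipped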


def _build_sandbox_image(
base_image: str,
target_image_name: str,
docker_client: docker.DockerClient,
skip_init: bool = False,
extra_deps: str | None = None,
):
) -> str:
"""Build the sandbox image and return the *hash* docker image name.
The hash is calculated based on the contents of the docker build folder (source code and Dockerfile). This is useful to help prevent rebuilding the image when the source code and Dockerfile are unchanged.
"""
target_repo, target_image_tag = target_image_name.split(':')
try:
with tempfile.TemporaryDirectory() as temp_dir:
if skip_init:
@@ -124,36 +147,62 @@
)
else:
logger.info(f'Building agnostic sandbox image: {target_image_name}')
prep_docker_build_folder(

dir_hash = prep_docker_build_folder(
temp_dir, base_image, skip_init=skip_init, extra_deps=extra_deps
)
api_client = docker_client.api
build_logs = api_client.build(
path=temp_dir,
tag=target_image_name,
rm=True,
decode=True,
# do not use cache when skip_init is True (i.e., when we want to update the source code in the existing image)
nocache=skip_init,
)
# Use dir_hash as an alternative tag for the image
# This is useful to help prevent rebuilding the image when the source code/Dockerfile is the same
target_image_hash_name = f'{target_repo}:{dir_hash}'

if skip_init:
# Check if the hash image exists
if _check_image_exists(target_image_hash_name, docker_client):
logger.info(f'Image {target_image_hash_name} exists, skipping build.')
else:
logger.info(
f'Image {target_image_name} does not exist, neither does its hash {target_image_hash_name}.\n'
'Building the image...'
)

api_client = docker_client.api
build_logs = api_client.build(
path=temp_dir,
tag=target_image_hash_name,
rm=True,
decode=True,
# do not use cache when skip_init is True (i.e., when we want to update the source code in the existing image)
nocache=skip_init,
)

if skip_init:
logger.info(
f'Rebuilding existing od_sandbox image [{target_image_name}] to update the source code.'
)
for log in build_logs:
if 'stream' in log:
print(log['stream'].strip())
elif 'error' in log:
logger.error(log['error'].strip())
else:
logger.info(str(log))

logger.info(f'Image {target_image_hash_name} build finished.')
image = docker_client.images.get(target_image_hash_name)
image.tag(target_repo, target_image_tag)
logger.info(
f'Rebuilding existing od_sandbox image [{target_image_name}] to update the source code.'
f'Tagged image {target_image_hash_name} --> {target_image_name}'
)
for log in build_logs:
if 'stream' in log:
print(log['stream'].strip())
elif 'error' in log:
logger.error(log['error'].strip())
else:
logger.info(str(log))

# check if the image is built successfully
image = docker_client.images.get(target_image_name)
image = docker_client.images.get(target_image_hash_name)
if image is None:
raise RuntimeError(f'Build failed: Image {target_image_name} not found')
logger.info(f'Image {target_image_name} built successfully')
raise RuntimeError(
f'Build failed: Image {target_image_hash_name} / {target_image_name} not found'
)
logger.info(
f'Image {target_image_name} (hash: {target_image_hash_name}) built successfully'
)
return target_image_hash_name
except docker.errors.BuildError as e:
logger.error(f'Sandbox image build failed: {e}')
raise e
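
(To illustrate the dual tagging above: after a successful build the hash tag and the human-readable tag resolve to the same image, which is what lets a later run with an unchanged build folder skip the build entirely. A sketch that reuses the names from this function:)

img_by_hash = docker_client.images.get(f'{target_repo}:{dir_hash}')
img_by_name = docker_client.images.get(f'{target_repo}:{target_image_tag}')
assert img_by_hash.id == img_by_name.id  # one image, two tags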
@@ -183,6 +232,16 @@ def get_new_image_name(base_image: str, dev_mode: bool = False) -> str:


def _check_image_exists(image_name: str, docker_client: docker.DockerClient) -> bool:
"""Check if the image exists in the registry (try to pull it first) AND in the local store.
image_name is f'{repo}:{tag}'
"""
# Try to pull the new image from the registry
try:
docker_client.images.pull(image_name)
except Exception:
logger.info(f'Cannot pull image {image_name} directly')

images = docker_client.images.list()
if images:
for image in images:
@@ -217,12 +276,6 @@ def build_runtime_image(
f'Invalid image name: {new_image_name}. Expected format "repository:tag".'
)

# Try to pull the new image from the registry
try:
docker_client.images.pull(new_image_name)
except Exception:
logger.info(f'Cannot pull image {new_image_name} directly')

# Detect if the sandbox image is built
image_exists = _check_image_exists(new_image_name, docker_client)
if image_exists:
@@ -235,6 +288,7 @@
# If (1) Image exists & we are not updating the source code, we can reuse the existing production image
logger.info('No image build done (not updating source code)')
return new_image_name

elif image_exists and update_source_code:
# If (2) Image exists & we plan to update the source code (in dev mode), we need to rebuild the image
# and give it a special name
@@ -244,6 +298,7 @@
new_image_name = get_new_image_name(base_image, dev_mode=True)

skip_init = True # since we only need to update the source code

else:
# If (3) Image does not exist, we need to build it from scratch
# e.g., ubuntu:latest -> od_runtime:ubuntu_tag_latest
@@ -260,7 +315,7 @@
if not skip_init:
logger.info(f'Building image [{new_image_name}] from scratch')

_build_sandbox_image(
new_image_name = _build_sandbox_image(
base_image,
new_image_name,
docker_client,
@@ -299,15 +354,17 @@ def build_runtime_image(
f'Will prepare a build folder by copying the source code and generating the Dockerfile: {build_folder}'
)
new_image_path = get_new_image_name(args.base_image)
prep_docker_build_folder(
dir_hash = prep_docker_build_folder(
build_folder, args.base_image, skip_init=args.update_source_code
)
new_image_name, new_image_tag = new_image_path.split(':')
with open(os.path.join(build_folder, 'config.sh'), 'a') as file:
file.write(
(
f'\n'
f'DOCKER_IMAGE={new_image_name}\n'
f'DOCKER_IMAGE_TAG={new_image_tag}\n'
f'DOCKER_IMAGE_HASH_TAG={dir_hash}\n'
)
)
logger.info(
4 changes: 1 addition & 3 deletions opendevin/runtime/utils/runtime_templates/Dockerfile.j2
@@ -44,10 +44,8 @@ RUN /opendevin/miniforge3/bin/mamba install conda-forge::poetry python=3.11 -y
# ================================================================
# START: Copy Project and Install/Update Dependencies
# ================================================================
COPY project.tar.gz /opendevin
RUN if [ -d /opendevin/code ]; then rm -rf /opendevin/code; fi
RUN cd /opendevin && tar -xzvf project.tar.gz && rm project.tar.gz
RUN mv /opendevin/{{ source_code_dirname }} /opendevin/code
COPY ./code /opendevin/code

# Install/Update Dependencies
# 1. Install pyproject.toml via poetry
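
With the tarball unpacking removed from the template, the COPY ./code instruction relies on the layout that prep_docker_build_folder produces: the rendered Dockerfile sitting next to the unpacked code/ directory in the build context. A small hedged check along those lines, where build_dir stands for the prepared folder:

import os

entries = set(os.listdir(build_dir))
assert 'Dockerfile' in entries and 'code' in entries
assert os.path.isdir(os.path.join(build_dir, 'code'))  # COPY ./code needs a real directory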
33 changes: 31 additions & 2 deletions poetry.lock

(generated file; diff not rendered)

1 change: 1 addition & 0 deletions pyproject.toml
@@ -41,6 +41,7 @@ grep-ast = "0.3.3"
tree-sitter = "0.21.3"
bashlex = "^0.18"
pyjwt = "^2.9.0"
dirhash = "*"

[tool.poetry.group.llama-index.dependencies]
llama-index = "*"
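
The new dirhash dependency provides the directory digest used above: roughly, it walks a directory tree and hashes file contents together with their relative names, so identical trees yield identical digests. Minimal usage with a placeholder path:

from dirhash import dirhash

digest = dirhash('/path/to/docker/build/folder', 'md5')
print(digest)  # stable across runs while the folder contents stay the same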
