Skip to content

Commit

Permalink
[cherry-pick] Add the pytorch-mnist with GPU support container image (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
tenzen-y authored Jul 17, 2022
1 parent 8dcc7d3 commit 12a4896
Show file tree
Hide file tree
Showing 15 changed files with 61 additions and 25 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/publish-trial-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@ jobs:
include:
- trial-name: mxnet-mnist
dockerfile: examples/v1beta1/trial-images/mxnet-mnist/Dockerfile
- trial-name: pytorch-mnist
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile
- trial-name: pytorch-mnist-cpu
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu
- trial-name: pytorch-mnist-gpu
dockerfile: examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
- trial-name: tf-mnist-with-summaries
dockerfile: examples/v1beta1/trial-images/tf-mnist-with-summaries/Dockerfile
- trial-name: enas-cnn-cifar10-gpu
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pytorch-mnist-e2e-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
experiments: ${{ matrix.experiments }}
training-operator: true
# Comma Delimited
trial-images: pytorch-mnist
trial-images: pytorch-mnist-cpu

strategy:
fail-fast: false
Expand Down
17 changes: 14 additions & 3 deletions docs/images-location.md
Original file line number Diff line number Diff line change
Expand Up @@ -273,13 +273,24 @@ The following table shows images for training containers which are used in the
</tr>
<tr align="center">
<td>
<code>docker.io/kubeflowkatib/pytorch-mnist</code>
<code>docker.io/kubeflowkatib/pytorch-mnist-cpu</code>
</td>
<td>
PyTorch MNIST example with printing metrics to the file or StdOut
PyTorch MNIST example with CPU support that prints metrics to a file or StdOut
</td>
<td>
<a href="https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile">Dockerfile</a>
<a href="https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.cpu">Dockerfile</a>
</td>
</tr>
<tr align="center">
<td>
<code>docker.io/kubeflowkatib/pytorch-mnist-gpu</code>
</td>
<td>
PyTorch MNIST example with GPU support that prints metrics to a file or StdOut
</td>
<td>
<a href="https://github.com/kubeflow/katib/blob/master/examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu">Dockerfile</a>
</td>
</tr>
<tr align="center">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ spec:
spec:
containers:
- name: pytorch
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
Expand All @@ -61,7 +61,7 @@ spec:
spec:
containers:
- name: pytorch
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ spec:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
Expand Down
15 changes: 15 additions & 0 deletions examples/v1beta1/trial-images/pytorch-mnist/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Trial image for the PyTorch MNIST example, based on the CUDA 11.3 / cuDNN 8
# PyTorch runtime image.
# The COPY source path below is relative to the build context, so the context
# must contain examples/v1beta1/trial-images/pytorch-mnist.
FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime

# COPY rather than ADD: this is a plain local directory copy and needs neither
# archive extraction nor URL fetching.
COPY examples/v1beta1/trial-images/pytorch-mnist /opt/pytorch-mnist
WORKDIR /opt/pytorch-mnist

# Install the example's Python dependencies; --no-cache-dir keeps pip's
# download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt

# Add folder for the logs, then hand both the application directory and the
# log folder to group 0 with group read/write (and execute/traverse where
# applicable) access.
# NOTE(review): presumably this lets the container run under an arbitrary
# non-root UID that belongs to the root group - confirm against the deployment.
RUN mkdir /katib \
    && chgrp -R 0 /opt/pytorch-mnist /katib \
    && chmod -R g+rwX /opt/pytorch-mnist /katib

# Exec form, so python3 runs directly without a wrapping shell.
ENTRYPOINT ["python3", "/opt/pytorch-mnist/mnist.py"]
4 changes: 2 additions & 2 deletions manifests/v1beta1/components/controller/trial-templates.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ data:
spec:
containers:
- name: pytorch
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
Expand All @@ -68,7 +68,7 @@ data:
spec:
containers:
- name: pytorch
image: docker.io/kubeflowkatib/pytorch-mnist:v0.14.0-rc.0
image: docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0-rc.0
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
Expand Down
7 changes: 5 additions & 2 deletions scripts/v1beta1/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -123,8 +123,11 @@ else
echo -e "\nBuilding mxnet mnist training container example...\n"
docker build --platform linux/amd64 -t "${REGISTRY}/mxnet-mnist:${TAG}" -f examples/${VERSION}/trial-images/mxnet-mnist/Dockerfile .

echo -e "\nBuilding PyTorch mnist training container example...\n"
docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile .
echo -e "\nBuilding PyTorch mnist training container example with CPU support...\n"
docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist-cpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.cpu .

echo -e "\nBuilding PyTorch mnist training container example with GPU support...\n"
docker build --platform linux/amd64 -t "${REGISTRY}/pytorch-mnist-gpu:${TAG}" -f examples/${VERSION}/trial-images/pytorch-mnist/Dockerfile.gpu .

echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
docker build --platform linux/amd64 -t "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}" -f examples/${VERSION}/trial-images/enas-cnn-cifar10/Dockerfile.gpu .
Expand Down
7 changes: 5 additions & 2 deletions scripts/v1beta1/push.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,11 @@ docker push "${REGISTRY}/mxnet-mnist:${TAG}"
echo -e "\nPushing Tensorflow with summaries mnist training container example...\n"
docker push "${REGISTRY}/tf-mnist-with-summaries:${TAG}"

echo -e "\nPushing PyTorch mnist training container example...\n"
docker push "${REGISTRY}/pytorch-mnist:${TAG}"
echo -e "\nPushing PyTorch mnist training container example with CPU support...\n"
docker push "${REGISTRY}/pytorch-mnist-cpu:${TAG}"

echo -e "\nPushing PyTorch mnist training container example with GPU support...\n"
docker push "${REGISTRY}/pytorch-mnist-gpu:${TAG}"

echo -e "\nPushing Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n"
docker push "${REGISTRY}/enas-cnn-cifar10-gpu:${TAG}"
Expand Down
6 changes: 4 additions & 2 deletions scripts/v1beta1/update-images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ update_yaml_files "${CONFIG_PATH}" ":[^[:space:]].*\"" ":${TAG}\""

# Postfixes for each Trial image.
MXNET_MNIST="mxnet-mnist"
PYTORCH_MNIST="pytorch-mnist"
PYTORCH_MNIST_CPU="pytorch-mnist-cpu"
PYTORCH_MNIST_GPU="pytorch-mnist-gpu"
TF_MNIST_WITH_SUMMARIES="tf-mnist-with-summaries"
ENAS_GPU="enas-cnn-cifar10-gpu"
ENAS_CPU="enas-cnn-cifar10-cpu"
Expand All @@ -93,7 +94,8 @@ SIMPLE_PBT="simple-pbt"

echo -e "Update Katib Trial training container images\n"
update_yaml_files "./" "${OLD_PREFIX}${MXNET_MNIST}:.*" "${NEW_PREFIX}${MXNET_MNIST}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST}:.*" "${NEW_PREFIX}${PYTORCH_MNIST}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_CPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_CPU}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${PYTORCH_MNIST_GPU}:.*" "${NEW_PREFIX}${PYTORCH_MNIST_GPU}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${TF_MNIST_WITH_SUMMARIES}:.*" "${NEW_PREFIX}${TF_MNIST_WITH_SUMMARIES}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${ENAS_GPU}:.*" "${NEW_PREFIX}${ENAS_GPU}:${TAG}"
update_yaml_files "./" "${OLD_PREFIX}${ENAS_CPU}:.*" "${NEW_PREFIX}${ENAS_CPU}:${TAG}"
Expand Down
10 changes: 5 additions & 5 deletions test/e2e/v1beta1/scripts/gh-actions/build-load.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ REGISTRY="docker.io/kubeflowkatib"
TAG="e2e-test"
VERSION="v1beta1"
CMD_PREFIX="cmd"
SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu")
SPECIFIED_DEVICE_TYPE_IMAGES=("enas-cnn-cifar10-cpu" "darts-cnn-cifar10-cpu" "pytorch-mnist-cpu")

IFS="," read -r -a TRIAL_IMAGE_ARRAY <<< "$TRIAL_IMAGES"
IFS="," read -r -a EXPERIMENT_ARRAY <<< "$EXPERIMENTS"
Expand All @@ -51,7 +51,7 @@ _build_containers() {
docker build --platform "$(uname -m)" -t "$REGISTRY/$CONTAINER_NAME:$TAG" -f "../../../../../$DOCKERFILE" ../../../../../
}

_load_kind_cluster() {
_load_minikube_cluster() {
CONTAINER_NAME=${1:-"katib-controller"}

echo -e "\n\nLoading $CONTAINER_NAME image...\n\n"
Expand Down Expand Up @@ -99,7 +99,7 @@ run() {
for s in "${suggestions[@]}"; do
if [ "$s" == "$CONTAINER_NAME" ]; then
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
_load_kind_cluster "$CONTAINER_NAME"
_load_minikube_cluster "$CONTAINER_NAME"
break
fi
done
Expand All @@ -126,15 +126,15 @@ run() {
for e in "${earlystoppings[@]}"; do
if [ "$e" == "$CONTAINER_NAME" ]; then
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
_load_kind_cluster "$CONTAINER_NAME"
_load_minikube_cluster "$CONTAINER_NAME"
break
fi
done

# Others
else
_build_containers "$CONTAINER_NAME" "$DOCKERFILE"
_load_kind_cluster "$CONTAINER_NAME"
_load_minikube_cluster "$CONTAINER_NAME"
fi
}

Expand Down

0 comments on commit 12a4896

Please sign in to comment.