Skip to content

Commit e34c886

Browse files
authored
Build torch and lightgbm from source. (#1083)
* Build torch and lightgbm from source. Fixes #984, #1059. Introduced a new "architecture" to easily build packages from source in the main build only if needed (i.e. if the base image or the package version has changed). This enables us to: - Upgrade PyTorch, which doesn't have a wheel for 1.9.1 and CUDA 11. This prevented us from upgrading torch for ~6 months. - Move the LightGBM GPU source build to this architecture to shave ~3 minutes off the build time. http://b/181966788 * Add /usr/local/cuda/compat to LD_LIBRARY_PATH * Build torchaudio and torchtext from source * Increase torch build timeout
1 parent 7cf514a commit e34c886

File tree

8 files changed

+344
-19
lines changed

8 files changed

+344
-19
lines changed

Dockerfile.tmpl

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,33 @@
1+
ARG BASE_IMAGE_REPO
2+
ARG BASE_IMAGE_TAG
3+
ARG CPU_BASE_IMAGE_NAME
4+
ARG GPU_BASE_IMAGE_NAME
5+
ARG LIGHTGBM_VERSION
6+
ARG TORCH_VERSION
7+
ARG TORCHAUDIO_VERSION
8+
ARG TORCHTEXT_VERSION
9+
ARG TORCHVISION_VERSION
10+
111
{{ if eq .Accelerator "gpu" }}
2-
FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80
12+
FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
13+
FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
14+
FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
315
ENV CUDA_MAJOR_VERSION=11
416
ENV CUDA_MINOR_VERSION=0
517
{{ else }}
6-
FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m80
18+
FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
719
{{ end }}
820
# Keep these variables in sync if base image is updated.
921
ENV TENSORFLOW_VERSION=2.6.0
22+
23+
# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
24+
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
25+
ARG LIGHTGBM_VERSION
26+
ARG TORCH_VERSION
27+
ARG TORCHAUDIO_VERSION
28+
ARG TORCHTEXT_VERSION
29+
ARG TORCHVISION_VERSION
30+
1031
# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
1132
# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
1233
ENV KMP_WARNINGS=0
@@ -15,6 +36,9 @@ ADD clean-layer.sh /tmp/clean-layer.sh
1536
ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
1637
ADD patches/template_conf.json /opt/kaggle/conf.json
1738

39+
# Adds the directory providing libcuda.so to LD_LIBRARY_PATH which is necessary for the GPU mxnet package.
40+
# The `:+` expansion only emits the existing value (plus the `:` separator) when LD_LIBRARY_PATH is
# already set, so an unset variable does not produce a leading `:` (an empty entry, which the dynamic
# loader treats as the current directory).
ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/compat
41+
1842
{{ if eq .Accelerator "gpu" }}
1943
# b/200968891 Keeps horovod once torch is upgraded.
2044
RUN pip uninstall -y horovod && \
@@ -52,29 +76,24 @@ RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MI
5276

5377
# Install PyTorch
5478
{{ if eq .Accelerator "gpu" }}
55-
RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
79+
COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/
80+
RUN pip install /tmp/torch/*.whl && \
81+
rm -rf /tmp/torch && \
5682
/tmp/clean-layer.sh
5783
{{ else }}
58-
RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
84+
RUN pip install torch==$TORCH_VERSION+cpu torchvision==$TORCHVISION_VERSION+cpu torchaudio==$TORCHAUDIO_VERSION torchtext==$TORCHTEXT_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \
5985
/tmp/clean-layer.sh
6086
{{ end }}
6187

6288
# Install LightGBM
63-
ENV LIGHTGBM_VERSION=3.2.1
6489
{{ if eq .Accelerator "gpu" }}
90+
COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/
6591
# Install OpenCL (required by LightGBM GPU version)
6692
RUN apt-get install -y ocl-icd-libopencl1 clinfo && \
6793
mkdir -p /etc/OpenCL/vendors && \
6894
echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
69-
cd /usr/local/src && \
70-
git clone --recursive https://github.com/microsoft/LightGBM && \
71-
cd LightGBM && \
72-
git checkout tags/v$LIGHTGBM_VERSION && \
73-
mkdir build && cd build && \
74-
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
75-
make -j$(nproc) && \
76-
cd /usr/local/src/LightGBM/python-package && \
77-
python setup.py install --precompile && \
95+
pip install /tmp/lightgbm/*.whl && \
96+
rm -rf /tmp/lightgbm && \
7897
/tmp/clean-layer.sh
7998
{{ else }}
8099
RUN pip install lightgbm==$LIGHTGBM_VERSION && \
@@ -386,8 +405,7 @@ RUN pip install bleach && \
386405
pip install widgetsnbextension && \
387406
pip install pyarrow && \
388407
pip install feather-format && \
389-
# fastai >= 2.3.1 upgrades pytorch/torchvision. upgrade of pytorch will be handled in b/181966788
390-
pip install fastai==2.2.7 && \
408+
pip install fastai && \
391409
pip install allennlp && \
392410
# https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5
393411
pip install importlib-metadata==3.4.0 && \

Jenkinsfile

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,42 @@ pipeline {
3434
'''
3535
}
3636
}
37+
stage('Pre-build Packages from Source') {
38+
parallel {
39+
stage('torch') {
40+
options {
41+
timeout(time: 180, unit: 'MINUTES')
42+
}
43+
steps {
44+
sh '''#!/bin/bash
45+
set -exo pipefail
46+
source config.txt
47+
cd packages/
48+
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
49+
--package torch \
50+
--version $TORCH_VERSION \
51+
--build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \
52+
--build-arg TORCHTEXT_VERSION=$TORCHTEXT_VERSION \
53+
--build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \
54+
--push
55+
'''
56+
}
57+
}
58+
stage('lightgbm') {
59+
options {
60+
timeout(time: 10, unit: 'MINUTES')
61+
}
62+
steps {
63+
sh '''#!/bin/bash
64+
set -exo pipefail
65+
source config.txt
66+
cd packages/
67+
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG --package lightgbm --version $LIGHTGBM_VERSION --push
68+
'''
69+
}
70+
}
71+
}
72+
}
3773
stage('Build/Test/Diff') {
3874
parallel {
3975
stage('CPU') {
@@ -79,7 +115,7 @@ pipeline {
79115
}
80116
stage('GPU') {
81117
agent { label 'ephemeral-linux-gpu' }
82-
stages {
118+
stages {
83119
stage('Build GPU Image') {
84120
options {
85121
timeout(time: 120, unit: 'MINUTES')

build

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,18 @@ done
4747
BUILD_ARGS+=" --build-arg GIT_COMMIT=$(git rev-parse HEAD)"
4848
BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')"
4949

50+
# Read build args from config.txt file (one KEY=VALUE entry per line).
51+
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
52+
# Read the file line by line instead of word-splitting `cat` output, so that entries are not
# glob-expanded. The `|| [[ -n ... ]]` part keeps a last line that has no trailing newline.
while read -r config_line || [[ -n "$config_line" ]]; do
53+
# Blank lines and `#` comments are not build args; skip them.
if [[ -z "$config_line" || "$config_line" == \#* ]]; then
continue
fi
BUILD_ARGS+=" --build-arg $config_line"
54+
done < "${SRCDIR}/config.txt"
55+
5056
readonly CACHE_FLAG
5157
readonly DOCKERFILE
5258
readonly ACCELERATOR
5359
readonly IMAGE_TAG
5460
readonly BUILD_ARGS
5561

56-
57-
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
5862
DOCKERFILE_OUTDIR="${SRCDIR}/.generated"
5963
mkdir -p $DOCKERFILE_OUTDIR
6064
DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE"

config.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
2+
BASE_IMAGE_TAG=m80
3+
CPU_BASE_IMAGE_NAME=tf2-cpu.2-6
4+
GPU_BASE_IMAGE_NAME=tf2-gpu.2-6
5+
LIGHTGBM_VERSION=3.2.1
6+
TORCH_VERSION=1.9.1
7+
TORCHAUDIO_VERSION=0.9.1
8+
TORCHTEXT_VERSION=0.10.1
9+
TORCHVISION_VERSION=0.10.1

packages/README.md

Whitespace-only changes.

packages/build_package

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# Prints the command-line help for this script to stdout.
usage() {
5+
cat << EOF
6+
Usage: $0 [OPTIONS]
7+
Build a new package ".whl".
8+
9+
Options:
10+
-p, --package PACKAGE Package to build (e.g. lightgbm).
11+
-v, --version VERSION Package version to build.
12+
-b, --base-image IMAGE Base image to build from (e.g. gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80).
13+
-c, --use-cache Use layer cache when building a new image.
14+
-f, --force-rebuild Rebuild the image regardless of whether it already exists on GCR.
15+
-u, --push Push image to GCR.
16+
--build-arg ARG=VALUE Build arguments to pass to the docker build command.
-h, --help Show this help and exit.
17+
EOF
18+
}
19+
20+
PACKAGE=''
21+
PACKAGE_VERSION=''
22+
BASE_IMAGE=''
23+
DOCKERFILE=''
24+
CACHE_FLAG='--no-cache'
25+
FORCE_REBUILD=false
26+
PUSH_TO_GCR=false
27+
BUILD_ARGS=''
28+
29+
while :; do
30+
case "$1" in
31+
-h|--help)
32+
usage
33+
exit
34+
;;
35+
# --package selects the package name and, from it, the <PACKAGE>.Dockerfile to build.
-p|--package)
36+
if [[ -z $2 ]]; then
37+
usage
38+
printf 'ERROR: No PACKAGE specified after the %s flag.\n' "$1" >&2
39+
exit 1
40+
fi
41+
PACKAGE=$2
42+
DOCKERFILE="${PACKAGE}.Dockerfile"
43+
shift # skip the flag value
44+
;;
45+
-v|--version)
46+
if [[ -z $2 ]]; then
47+
usage
48+
printf 'ERROR: No VERSION specified after the %s flag.\n' "$1" >&2
49+
exit 1
50+
fi
51+
PACKAGE_VERSION=$2
52+
shift # skip the flag value
53+
;;
54+
# `-b` is the short flag documented by usage(); `-t` is kept so existing callers keep working.
-b|-t|--base-image)
55+
if [[ -z $2 ]]; then
56+
usage
57+
printf 'ERROR: No IMAGE specified after the %s flag.\n' "$1" >&2
58+
exit 1
59+
fi
60+
BASE_IMAGE=$2
61+
shift # skip the flag value
62+
;;
63+
-c|--use-cache)
64+
CACHE_FLAG=''
65+
;;
66+
-f|--force-rebuild)
67+
FORCE_REBUILD=true
68+
;;
69+
-u|--push)
70+
PUSH_TO_GCR=true
71+
;;
72+
--build-arg)
73+
if [[ -z $2 ]]; then
74+
usage
75+
printf 'ERROR: No ARG=VALUE specified after the %s flag.\n' "$1" >&2
76+
exit 1
77+
fi
78+
BUILD_ARGS+=" $1 $2"
79+
shift # skip the flag value
80+
;;
81+
-?*)
82+
usage
83+
printf 'ERROR: Unknown option: %s\n' "$1" >&2
84+
exit 1
85+
;;
86+
*)
87+
break
88+
esac
89+
90+
shift
91+
done
92+
93+
# Freeze every value produced by the option parser; nothing below reassigns them.
readonly PACKAGE
94+
readonly PACKAGE_VERSION
95+
readonly BASE_IMAGE
96+
readonly DOCKERFILE
97+
readonly CACHE_FLAG
98+
readonly FORCE_REBUILD
readonly PUSH_TO_GCR
readonly BUILD_ARGS
99+
100+
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
101+
DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"
102+
103+
# Validate the required flags. Errors go to stderr, like the option parser's error messages,
# so they are not mixed into stdout.
if [[ -z "$PACKAGE_VERSION" ]]; then
104+
printf 'ERROR: missing --version flag.\n' >&2
105+
exit 1
106+
fi
107+
108+
if [[ -z "$BASE_IMAGE" ]]; then
109+
printf 'ERROR: missing --base-image flag.\n' >&2
110+
exit 1
111+
fi
112+
113+
# DOCKERFILE is only set by --package, so an empty value means the flag was not given.
if [[ -z "$DOCKERFILE" ]]; then
114+
printf 'ERROR: missing --package flag.\n' >&2
115+
exit 1
116+
fi
117+
118+
# Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80`.
# Everything up to the last `/` is stripped rather than one hard-coded repository, so the
# result stays a valid tag (no `/`) if the base image repository ever changes.
119+
TAG=${BASE_IMAGE##*/}
120+
# Replace the `:` in `tf2-gpu.2-6:m80` by `-`
121+
TAG=${TAG/:/-}
122+
# Append the package version
123+
TAG=$TAG-$PACKAGE_VERSION
124+
# Add the gcr repo.
125+
TAG=gcr.io/kaggle-images/python-$PACKAGE-whl:$TAG
126+
127+
# Build the wheel image unless it already exists in the registry (or --force-rebuild was given),
# then optionally push it.
SHOULD_BUILD=true
128+
if ! $FORCE_REBUILD; then
129+
echo "Checking if $TAG exists..."
130+
# A successful pull means the image already exists, so the build can be skipped.
# A failed pull does not abort the script under `set -e` because it sits on the left of `&&`.
docker pull "$TAG" && SHOULD_BUILD=false
131+
fi
132+
133+
if $SHOULD_BUILD; then
134+
echo "Building $TAG..."
135+
# $BUILD_ARGS and $CACHE_FLAG are intentionally unquoted: they hold zero or more
# space-separated words that must each become a separate docker argument.
docker build --rm --pull $BUILD_ARGS \
136+
$CACHE_FLAG \
137+
-t "$TAG" \
138+
-f "$DOCKERFILE_PATH" \
139+
--build-arg BASE_IMAGE="$BASE_IMAGE" \
140+
--build-arg PACKAGE_VERSION="$PACKAGE_VERSION" \
141+
"$SRCDIR"
142+
143+
if $PUSH_TO_GCR; then
144+
echo "Pushing $TAG to GCR..."
145+
docker push "$TAG"
146+
fi
147+
else
148+
echo "Skipping build. $TAG already exists."
149+
echo "Use --force-rebuild if you want to build a new version anyway."
150+
fi

packages/lightgbm.Dockerfile

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Builds the GPU-enabled LightGBM wheel on top of BASE_IMAGE and exports it under /tmp/whl/
# in a small final image.
# BASE_IMAGE and PACKAGE_VERSION are passed in as build args by the build_package script.
ARG BASE_IMAGE
2+
3+
FROM ${BASE_IMAGE} AS builder
4+
5+
ARG PACKAGE_VERSION
6+
7+
# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
8+
RUN apt-get update && \
9+
apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev ocl-icd-libopencl1 clinfo
10+
11+
# The clone checks out the default branch, so after switching to the release tag the submodules
# are re-synced to the revisions recorded by that tag.
RUN cd /usr/local/src && \
12+
git clone --recursive https://github.com/microsoft/LightGBM && \
13+
cd LightGBM && \
14+
git checkout tags/v$PACKAGE_VERSION && \
git submodule update --init --recursive && \
15+
mkdir build && cd build && \
16+
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
17+
make -j$(nproc) && \
18+
cd /usr/local/src/LightGBM/python-package && \
19+
python setup.py bdist_wheel
20+
21+
# Using multi-stage builds to ensure the output image is very small
22+
# See: https://docs.docker.com/develop/develop-images/multistage-build/
# Pinned (instead of `latest`) so rebuilding the same package version yields the same image.
23+
FROM alpine:3.20
24+
25+
RUN mkdir -p /tmp/whl/
26+
# The trailing slash marks the destination as a directory, which a wildcard source requires.
COPY --from=builder /usr/local/src/LightGBM/python-package/dist/*.whl /tmp/whl/
27+
28+
# Print out the built .whl file.
29+
RUN ls -lh /tmp/whl/

0 commit comments

Comments
 (0)