Build torch and lightgbm from source. (#1083)

rosbo · web-flow · commit e34c886a130d · 2021-10-04T18:04:50.000-07:00
* Build torch and lightgbm from source. Fixes #984, #1059 Introduced a new "architecture" to easily build packages from source in the main build only if needed (i.e. if the base image or the package version has changed). This enables us to: - Upgrade PyTorch which doesn't have a wheel for 1.9.1 and CUDA 11. This prevented us from upgrading torch for ~6 months. - Move the lightgbm gpu source build to this architecture to shave ~3 minutes off the build time. http://b/181966788 * Add /usr/local/cuda/compat to LD_LIBRARY_PATH * Build torchaudio and torchtext from source * Increase torch build timeout
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
@@ -1,12 +1,33 @@
+ARG BASE_IMAGE_REPO
+ARG BASE_IMAGE_TAG
+ARG CPU_BASE_IMAGE_NAME
+ARG GPU_BASE_IMAGE_NAME
+ARG LIGHTGBM_VERSION
+ARG TORCH_VERSION
+ARG TORCHAUDIO_VERSION
+ARG TORCHTEXT_VERSION
+ARG TORCHVISION_VERSION
+
 {{ if eq .Accelerator "gpu" }}
-FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80
+FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
+FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
+FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
 ENV CUDA_MAJOR_VERSION=11
 ENV CUDA_MINOR_VERSION=0
 {{ else }}
-FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m80
+FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
 {{ end }}
 # Keep these variables in sync if base image is updated.
 ENV TENSORFLOW_VERSION=2.6.0
+
+# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
+# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
+ARG LIGHTGBM_VERSION
+ARG TORCH_VERSION
+ARG TORCHAUDIO_VERSION
+ARG TORCHTEXT_VERSION
+ARG TORCHVISION_VERSION
+
 # Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0 
 # See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
 ENV KMP_WARNINGS=0
@@ -15,6 +36,9 @@ ADD clean-layer.sh  /tmp/clean-layer.sh
 ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
 ADD patches/template_conf.json /opt/kaggle/conf.json
 
+# Adds the libcuda.so to LD_LIBRARY_PATH which is necessary for the GPU mxnet package.
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/compat
+
 {{ if eq .Accelerator "gpu" }}
 # b/200968891 Keeps horovod once torch is upgraded.
 RUN pip uninstall -y horovod && \
@@ -52,29 +76,24 @@ RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MI
 
 # Install PyTorch
 {{ if eq .Accelerator "gpu" }}
-RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
+COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/
+RUN pip install /tmp/torch/*.whl && \
+    rm -rf /tmp/torch && \
     /tmp/clean-layer.sh
 {{ else }}
-RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
+RUN pip install torch==$TORCH_VERSION+cpu torchvision==$TORCHVISION_VERSION+cpu torchaudio==$TORCHAUDIO_VERSION torchtext==$TORCHTEXT_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \
     /tmp/clean-layer.sh
 {{ end }}
 
 # Install LightGBM
-ENV LIGHTGBM_VERSION=3.2.1
 {{ if eq .Accelerator "gpu" }}
+COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/
 # Install OpenCL (required by LightGBM GPU version)
 RUN apt-get install -y ocl-icd-libopencl1 clinfo && \
     mkdir -p /etc/OpenCL/vendors && \
     echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
-    cd /usr/local/src && \
-    git clone --recursive https://github.com/microsoft/LightGBM && \
-    cd LightGBM && \
-    git checkout tags/v$LIGHTGBM_VERSION && \
-    mkdir build && cd build && \
-    cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
-    make -j$(nproc) && \
-    cd /usr/local/src/LightGBM/python-package && \
-    python setup.py install --precompile && \
+    pip install /tmp/lightgbm/*.whl && \
+    rm -rf /tmp/lightgbm && \
     /tmp/clean-layer.sh
 {{ else }}
 RUN pip install lightgbm==$LIGHTGBM_VERSION && \
@@ -386,8 +405,7 @@ RUN pip install bleach && \
     pip install widgetsnbextension && \
     pip install pyarrow && \
     pip install feather-format && \
-    # fastai >= 2.3.1 upgrades pytorch/torchvision. upgrade of pytorch will be handled in b/181966788
-    pip install fastai==2.2.7 && \
+    pip install fastai && \
     pip install allennlp && \
     # https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5
     pip install importlib-metadata==3.4.0 && \
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -34,6 +34,42 @@ pipeline {
         '''
       }
     }
+    stage('Pre-build Packages from Source') {
+      parallel {
+        stage('torch') {
+          options {
+            timeout(time: 180, unit: 'MINUTES')
+          }
+          steps {
+            sh '''#!/bin/bash
+              set -exo pipefail
+              source config.txt
+              cd packages/
+              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
+                --package torch \
+                --version $TORCH_VERSION \
+                --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \
+                --build-arg TORCHTEXT_VERSION=$TORCHTEXT_VERSION \
+                --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \
+                --push
+            '''
+          }
+        }
+        stage('lightgbm') {
+          options {
+            timeout(time: 10, unit: 'MINUTES')
+          }
+          steps {
+            sh '''#!/bin/bash
+              set -exo pipefail
+              source config.txt
+              cd packages/
+              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG --package lightgbm --version $LIGHTGBM_VERSION --push
+            '''
+          }
+        }
+      }
+    }
     stage('Build/Test/Diff') {
       parallel {
         stage('CPU') {
@@ -79,7 +115,7 @@ pipeline {
         }
         stage('GPU') {
           agent { label 'ephemeral-linux-gpu' }
-          stages {      
+          stages {  
             stage('Build GPU Image') {
               options {
                 timeout(time: 120, unit: 'MINUTES')
diff --git a/build b/build
@@ -47,14 +47,18 @@ done
 BUILD_ARGS+=" --build-arg GIT_COMMIT=$(git rev-parse HEAD)"
 BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')"
 
+# Read build args from config.txt file.
+SRCDIR=$(dirname "${BASH_SOURCE[0]}")
+for l in `cat ${SRCDIR}/config.txt`; do
+    BUILD_ARGS+=" --build-arg $l"
+done
+
 readonly CACHE_FLAG
 readonly DOCKERFILE
 readonly ACCELERATOR
 readonly IMAGE_TAG
 readonly BUILD_ARGS
 
-
-SRCDIR=$(dirname "${BASH_SOURCE[0]}")
 DOCKERFILE_OUTDIR="${SRCDIR}/.generated"
 mkdir -p $DOCKERFILE_OUTDIR
 DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE"
diff --git a/config.txt b/config.txt
@@ -0,0 +1,9 @@
+BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
+BASE_IMAGE_TAG=m80
+CPU_BASE_IMAGE_NAME=tf2-cpu.2-6
+GPU_BASE_IMAGE_NAME=tf2-gpu.2-6
+LIGHTGBM_VERSION=3.2.1
+TORCH_VERSION=1.9.1
+TORCHAUDIO_VERSION=0.9.1
+TORCHTEXT_VERSION=0.10.1
+TORCHVISION_VERSION=0.10.1
diff --git a/packages/README.md b/packages/README.md
diff --git a/packages/build_package b/packages/build_package
@@ -0,0 +1,150 @@
+#!/bin/bash
+set -e
+
+usage() { # Print the command-line help text to stdout.
+cat << EOF
+Usage: $0 [OPTIONS]
+Build a new package ".whl".
+
+Options:
+    -p, --package PACKAGE     Package to build (e.g. lightgbm).
+    -v, --version VERSION     Package version to build.
+    -b, --base-image IMAGE    Base image to build on (e.g. gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80).
+    -c, --use-cache           Use layer cache when building a new image.
+    -f, --force-rebuild       Rebuild the image regardless of whether it already exists on GCR.
+    -u, --push                Push image to GCR.
+    --build-arg ARG=VALUE     Build arguments to pass to the docker build command.
+EOF
+}
+
+PACKAGE=''
+PACKAGE_VERSION=''
+BASE_IMAGE=''
+DOCKERFILE=''
+CACHE_FLAG='--no-cache'
+FORCE_REBUILD=false
+PUSH_TO_GCR=false
+BUILD_ARGS=''
+
+while :; do # Consume one flag per iteration; the loop ends at the first argument that is not an option.
+    case "$1" in
+        -h|--help)
+            usage
+            exit
+            ;;
+        -p|--package)
+            if [[ -z $2 ]]; then
+                usage
+                printf 'ERROR: No PACKAGE specified after the %s flag.\n' "$1" >&2
+                exit 1
+            fi
+            PACKAGE=$2
+            DOCKERFILE="${PACKAGE}.Dockerfile" # Each package has its own <package>.Dockerfile next to this script.
+            shift # skip the flag value
+            ;;
+        -v|--version)
+            if [[ -z $2 ]]; then
+                usage
+                printf 'ERROR: No VERSION specified after the %s flag.\n' "$1" >&2
+                exit 1
+            fi
+            PACKAGE_VERSION=$2
+            shift # skip the flag value
+            ;;
+        -b|-t|--base-image) # -b is the short flag documented in usage(); -t is kept for backward compatibility.
+            if [[ -z $2 ]]; then
+                usage
+                printf 'ERROR: No IMAGE specified after the %s flag.\n' "$1" >&2
+                exit 1
+            fi
+            BASE_IMAGE=$2
+            shift # skip the flag value
+            ;;
+        -c|--use-cache)
+            CACHE_FLAG='' # Dropping --no-cache lets docker reuse cached layers.
+            ;;
+        -f|--force-rebuild)
+            FORCE_REBUILD=true
+            ;;
+        -u|--push)
+            PUSH_TO_GCR=true
+            ;;
+        --build-arg)
+            if [[ -z $2 ]]; then
+                usage
+                printf 'ERROR: No ARG=VALUE specified after the %s flag.\n' "$1" >&2
+                exit 1
+            fi
+            BUILD_ARGS+=" $1 $2" # Forwarded verbatim to `docker build`.
+            shift # skip the flag value
+            ;;
+        -?*)
+            usage
+            printf 'ERROR: Unknown option: %s\n' "$1" >&2
+            exit 1
+            ;;
+        *)
+            break
+    esac
+
+    shift
+done
+
+readonly PACKAGE
+readonly PACKAGE_VERSION
+readonly BASE_IMAGE
+readonly DOCKERFILE
+readonly CACHE_FLAG
+readonly FORCE_REBUILD
+
+SRCDIR=$(dirname "${BASH_SOURCE[0]}")
+DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"
+
+if [[ -z "$PACKAGE_VERSION" ]]; then # Required flags: report on stderr, like the option parser does.
+    printf 'ERROR: missing --version flag.\n' >&2
+    exit 1
+fi
+
+if [[ -z "$BASE_IMAGE" ]]; then
+    printf 'ERROR: missing --base-image flag.\n' >&2
+    exit 1
+fi
+
+if [[ -z "$DOCKERFILE" ]]; then # DOCKERFILE is only set by --package, so empty means the flag was not given.
+    printf 'ERROR: missing --package flag.\n' >&2
+    exit 1
+fi
+
+# Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80` 
+TAG=${BASE_IMAGE/gcr.io\/deeplearning-platform-release\//}
+# Replace the `:` in `tf2-gpu.2-6:m80` by `-`
+TAG=${TAG/:/-}
+# Append the package version
+TAG=$TAG-$PACKAGE_VERSION
+# Add the gcr repo.
+TAG=gcr.io/kaggle-images/python-$PACKAGE-whl:$TAG
+
+SHOULD_BUILD=true
+if ! $FORCE_REBUILD; then
+    echo "Checking if $TAG exists..."
+    docker pull "$TAG" && SHOULD_BUILD=false # A failed pull leaves SHOULD_BUILD=true, so the image gets built.
+fi
+
+if $SHOULD_BUILD; then
+    echo "Building $TAG..." # BUILD_ARGS and CACHE_FLAG below stay unquoted on purpose: they must word-split (or vanish when empty).
+    docker build --rm --pull $BUILD_ARGS \
+        $CACHE_FLAG \
+        -t "$TAG" \
+        -f "$DOCKERFILE_PATH" \
+        --build-arg BASE_IMAGE="$BASE_IMAGE" \
+        --build-arg PACKAGE_VERSION="$PACKAGE_VERSION" \
+        "$SRCDIR"
+
+    if $PUSH_TO_GCR; then
+        echo "Pushing $TAG to GCR..."
+        docker push "$TAG"
+    fi
+else
+    echo "Skipping build. $TAG already exists."
+    echo "Use --force-rebuild if you want to build a new version anyway."
+fi
diff --git a/packages/lightgbm.Dockerfile b/packages/lightgbm.Dockerfile
@@ -0,0 +1,29 @@
+ARG BASE_IMAGE
+
+FROM ${BASE_IMAGE} AS builder
+
+ARG PACKAGE_VERSION
+
+# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
+RUN apt-get update && \
+    apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev ocl-icd-libopencl1 clinfo
+
+RUN cd /usr/local/src && \
+    git clone --recursive https://github.com/microsoft/LightGBM && \
+    cd LightGBM && \
+    git checkout tags/v$PACKAGE_VERSION && \
+    mkdir build && cd build && \
+    cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
+    make -j$(nproc) && \
+    cd /usr/local/src/LightGBM/python-package && \
+    python setup.py bdist_wheel
+
+# Using multi-stage builds to ensure the output image is very small (base tag pinned for reproducible builds).
+# See: https://docs.docker.com/develop/develop-images/multistage-build/
+FROM alpine:3.19
+
+RUN mkdir -p /tmp/whl/
+COPY --from=builder /usr/local/src/LightGBM/python-package/dist/*.whl /tmp/whl/
+
+# Print out the built .whl file.
+RUN ls -lh /tmp/whl/
diff --git a/packages/torch.Dockerfile b/packages/torch.Dockerfile