[CI] upgrade pytorch version for benchmark CI (dmlc#5102)

* [CI] upgrade pytorch version for benchmark CI * update build arguments * update * update * upgrade torch to 1.13 * update docker image * update cmake args * try with cu116_torch112 * update build * update * update * update * update docker image * update * update * update * update * final update * fix continue running * update * update * update
XiaoYixin7132 · Jan 6, 2023 · 46a3fc2 · 46a3fc2
1 parent dde5cf5
commit 46a3fc2
Show file tree

Hide file tree

Showing 9 changed files with 31 additions and 45 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -134,6 +134,8 @@ def is_admin(name) {
   return (name in admins)
 }
 
+def regression_test_done = false
+
 pipeline {
   agent any
   triggers {
@@ -196,7 +198,6 @@ pipeline {
       }
       when { triggeredBy 'IssueCommentCause' }
       steps {
-        // container('dgl-ci-lint') {
           checkout scm
           script {
               def comment = env.GITHUB_COMMENT
@@ -229,12 +230,12 @@ pipeline {
               }
               pullRequest.comment("Finished the Regression test. Result table is at https://dgl-asv-data.s3-us-west-2.amazonaws.com/${env.GIT_COMMIT}_${instance_type}/results/result.csv. Jenkins job link is ${RUN_DISPLAY_URL}. ")
               currentBuild.result = 'SUCCESS'
-              return
+              regression_test_done = true
           }
-        // }
       }
     }
     stage('CI') {
+      when { expression { !regression_test_done } }
       stages {
         stage('Lint Check') {
           agent {

diff --git a/benchmarks/run.sh b/benchmarks/run.sh
@@ -9,7 +9,9 @@ ROOT=/asv/dgl
 
 conda activate base
 pip install --upgrade pip
-pip install asv
+# Newer asv versions (e.g. 0.5.1) use a different result format,
+# so we pin the version here; otherwise `generate_excel.py` would have to be changed.
+pip install asv==0.4.2
 pip uninstall -y dgl
 
 export DGL_BENCH_DEVICE=$DEVICE

diff --git a/benchmarks/scripts/build_dgl_asv.sh b/benchmarks/scripts/build_dgl_asv.sh
@@ -2,28 +2,22 @@
 
 set -e
 
-# . /opt/conda/etc/profile.d/conda.sh
-# conda activate pytorch-ci
 # Default building only with cpu
 DEVICE=${DGL_BENCH_DEVICE:-cpu}
 
 pip install -r /asv/torch_gpu_pip.txt
-pip install pandas rdflib ogb
 
 # build
-if [[ $DEVICE == "cpu" ]]; then
-    CMAKE_VARS=""
-else
-    CMAKE_VARS="-DUSE_CUDA=ON"
+CMAKE_VARS="-DUSE_OPENMP=ON -DBUILD_TORCH=ON -DBUILD_SPARSE=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda"
+if [[ $DEVICE == "gpu" ]]; then
+    CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
 fi
 arch=`uname -m`
 if [[ $arch == *"x86"* ]]; then
   CMAKE_VARS="-DUSE_AVX=ON $CMAKE_VARS"
 fi
 mkdir -p build
 pushd build
-cmake -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda -DBUILD_TORCH=ON $CMAKE_VARS ..
-make -j
+cmake $CMAKE_VARS ..
+make -j8
 popd
-
-# conda deactivate
diff --git a/benchmarks/scripts/generate_excel.py b/benchmarks/scripts/generate_excel.py
@@ -23,13 +23,9 @@ def get_branch_name_from_hash(hash):
 def main():
     results_path = Path("../results")
     results_path.is_dir()
-    benchmark_json_path = results_path / "benchmarks.json"
-    with benchmark_json_path.open() as f:
-        benchmark_json = json.load(f)
     machines = [f for f in results_path.glob("*") if f.is_dir()]
     output_results_dict = {}
     for machine in machines:
-        # commit_results_dict = {}
         per_machine_result = {}
         commit_results_json_paths = [
             f for f in machine.glob("*") if f.name != "machine.json"

diff --git a/benchmarks/scripts/install_dgl_asv.sh b/benchmarks/scripts/install_dgl_asv.sh
@@ -2,8 +2,6 @@
 
 set -e
 
-# . /opt/conda/etc/profile.d/conda.sh
-
 # install
 pushd python
 rm -rf build *.egg-info dist

diff --git a/benchmarks/scripts/publish.sh b/benchmarks/scripts/publish.sh
@@ -26,20 +26,20 @@ else
 fi
 
 WS_ROOT=/asv/dgl
-docker pull public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170
-if [ -z "$DGL_REG_CONF"]; then
+docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116
+if [ -z "$DGL_REG_CONF" ]; then
     DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
 else
     DOCKER_ENV_OPT=" -e DGL_REG_CONF=$DGL_REG_CONF $DOCKER_ENV_OPT"
 fi
 
-if [ -z "$INSTANCE_TYPE"]; then
+if [ -z "$INSTANCE_TYPE" ]; then
     DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
 else
     DOCKER_ENV_OPT=" -e INSTANCE_TYPE=$INSTANCE_TYPE $DOCKER_ENV_OPT"
 fi
 
-if [ -z "$MOUNT_PATH"]; then
+if [ -z "$MOUNT_PATH" ]; then
     DOCKER_MOUNT_OPT=""
 else
     DOCKER_MOUNT_OPT="-v ${MOUNT_PATH}:/tmp/dataset -v ${MOUNT_PATH}/dgl_home/:/root/.dgl/"
@@ -56,16 +56,18 @@ if [[ $DEVICE == "cpu" ]]; then
         $DOCKER_MOUNT_OPT \
         $DOCKER_ENV_OPT \
         --shm-size="16g" \
-        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170 /bin/bash
+        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
 else
     docker run --name dgl-reg \
         --rm --gpus all \
         $DOCKER_MOUNT_OPT \
         $DOCKER_ENV_OPT \
         --shm-size="16g" \
-        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170 /bin/bash
+        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
 fi
 
+pwd
+
 docker exec dgl-reg mkdir -p $WS_ROOT
 docker cp ../../.git dgl-reg:$WS_ROOT
 docker cp ../ dgl-reg:$WS_ROOT/benchmarks/

diff --git a/benchmarks/scripts/torch_gpu_pip.txt b/benchmarks/scripts/torch_gpu_pip.txt
@@ -1,12 +1,13 @@
---find-links https://download.pytorch.org/whl/torch
-torch==1.9.0+cu111
-torchvision
+--find-links https://download.pytorch.org/whl/torch_stable.html
+torch==1.13.1+cu116
+torchvision==0.14.1+cu116
+torchmetrics
 pytest
 nose
 numpy
 cython
 scipy
-networkx==2.5.1
+networkx
 matplotlib
 nltk
 requests[security]
@@ -15,5 +16,4 @@ awscli
 torchtext
 pandas
 rdflib
-ogb==1.3.1
-torchmetrics
+ogb
diff --git a/docker/Dockerfile.ci_gpu_cu11 → docker/Dockerfile.ci_benchmark b/docker/Dockerfile.ci_gpu_cu11 → docker/Dockerfile.ci_benchmark
@@ -1,5 +1,8 @@
 # CI docker GPU env
-FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu16.04
+FROM nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04
+
+ENV TZ=US
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
 RUN apt-get update --fix-missing
 
@@ -15,16 +18,6 @@ RUN bash /install/ubuntu_install_conda.sh
 
 ENV CONDA_ALWAYS_YES="true"
 
-COPY install/conda_env/torch_gpu.yml /install/conda_env/torch_gpu.yml
-COPY install/conda_env/torch_gpu_pip_latest.txt /install/conda_env/torch_gpu_pip.txt
-RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/torch_gpu.yml"]
-
-# COPY install/conda_env/tensorflow_gpu.yml /install/conda_env/tensorflow_gpu.yml
-# RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/tensorflow_gpu.yml"]
-
-# COPY install/conda_env/mxnet_gpu.yml /install/conda_env/mxnet_gpu.yml
-# RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/mxnet_gpu.yml"]
-
 ENV CONDA_ALWAYS_YES=
 
 # Environment variables

diff --git a/docker/install/ubuntu_install_conda.sh b/docker/install/ubuntu_install_conda.sh
@@ -7,7 +7,7 @@ apt-get update --fix-missing && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \
+wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
     /bin/bash ~/miniconda.sh -b -p /opt/conda && \
     rm ~/miniconda.sh && \
     /opt/conda/bin/conda clean -tipsy && \