From 64750575a29b61650d05b5c8432c384ad2a00653 Mon Sep 17 00:00:00 2001
From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com>
Date: Sat, 11 May 2024 16:21:05 +0800
Subject: [PATCH] [CI] upgrade torch to 2.3.0 and cuda to 12.1 (#7399)

---
 Jenkinsfile                                | 18 +++++++++---------
 docker/Dockerfile.ci_gpu                   |  3 +--
 docker/install/conda_env/torch_cpu_pip.txt |  2 +-
 docker/install/conda_env/torch_gpu_pip.txt |  2 +-
 tests/scripts/build_dgl.sh                 |  5 ++++-
 tests/scripts/task_unit_test.bat           |  2 +-
 tests/scripts/task_unit_test.sh            |  2 ++
 7 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 0e374b2665dc..5dba5d139aea 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -319,7 +319,7 @@ pipeline {
       agent {
         docker {
           label "dgl-ci-linux-cpu"
-          image "dgllib/dgl-ci-cpu:v240227_1200"
+          image "dgllib/dgl-ci-cpu:v240511_1440"
           args "-u root"
           alwaysPull true
         }
@@ -338,7 +338,7 @@ pipeline {
      agent {
        docker {
          label "dgl-ci-linux-cpu"
-          image "dgllib/dgl-ci-gpu:cu118_v240227_1200"
+          image "dgllib/dgl-ci-gpu:cu121_v240511_1440"
          args "-u root"
          alwaysPull true
        }
@@ -393,7 +393,7 @@ pipeline {
      agent {
        docker {
          label "dgl-ci-linux-cpu"
-          image "dgllib/dgl-ci-cpu:v240227_1200"
+          image "dgllib/dgl-ci-cpu:v240511_1440"
          args "-u root"
          alwaysPull true
        }
@@ -412,7 +412,7 @@ pipeline {
      agent {
        docker {
          label "dgl-ci-linux-gpu"
-          image "dgllib/dgl-ci-gpu:cu118_v240227_1200"
+          image "dgllib/dgl-ci-gpu:cu121_v240511_1440"
          args "-u root --runtime nvidia"
          alwaysPull true
        }
@@ -467,7 +467,7 @@ pipeline {
      agent {
        docker {
          label "dgl-ci-linux-gpu"
-          image "dgllib/dgl-ci-gpu:cu118_v240227_1200"
+          image "dgllib/dgl-ci-gpu:cu121_v240511_1440"
          args "-u root --runtime nvidia"
          alwaysPull true
        }
@@ -492,7 +492,7 @@ pipeline {
      agent {
        docker {
          label "dgl-ci-linux-cpu"
-          image "dgllib/dgl-ci-cpu:v240227_1200"
+          image "dgllib/dgl-ci-cpu:v240511_1440"
          args "-u root --shm-size=4gb"
          alwaysPull true
        }
@@ -545,7 +545,7 @@ pipeline {
      agent {
        docker {
          label "dgl-ci-linux-gpu"
-          image "dgllib/dgl-ci-gpu:cu118_v240227_1200"
+          image "dgllib/dgl-ci-gpu:cu121_v240511_1440"
          args "-u root --runtime nvidia --shm-size=8gb"
          alwaysPull true
        }
@@ -574,7 +574,7 @@ pipeline {
      agent {
        docker {
          label "dgl-ci-linux-cpu"
-          image "dgllib/dgl-ci-cpu:v240227_1200"
+          image "dgllib/dgl-ci-cpu:v240511_1440"
          args "-u root --shm-size=4gb"
          alwaysPull true
        }
@@ -621,7 +621,7 @@ pipeline {
      agent {
        docker {
          label "dgl-ci-linux-cpu"
-          image "dgllib/dgl-ci-cpu:v240227_1200"
+          image "dgllib/dgl-ci-cpu:v240511_1440"
          args "-u root"
          alwaysPull true
        }
diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu
index 5573814f2c90..85eb23d2fbd9 100644
--- a/docker/Dockerfile.ci_gpu
+++ b/docker/Dockerfile.ci_gpu
@@ -1,5 +1,5 @@
 # CI docker GPU env
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
 
 ENV TZ=US
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
@@ -35,5 +35,4 @@ ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
 ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
 ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LIBRARY_PATH}
 ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
-ENV CUDA_VISIBLE_DEVICES=0
 ENV TF_FORCE_GPU_ALLOW_GROWTH=true
diff --git a/docker/install/conda_env/torch_cpu_pip.txt b/docker/install/conda_env/torch_cpu_pip.txt
index 178eeca247fd..9d2675a897c4 100644
--- a/docker/install/conda_env/torch_cpu_pip.txt
+++ b/docker/install/conda_env/torch_cpu_pip.txt
@@ -17,7 +17,7 @@ rdflib
 requests[security]==2.28
 scikit-learn
 scipy
-torch==2.0.0+cpu
+torch==2.3.0+cpu
 torchdata
 torcheval
 torchmetrics
diff --git a/docker/install/conda_env/torch_gpu_pip.txt b/docker/install/conda_env/torch_gpu_pip.txt
index 51168e0cdcdf..38225aa1570d 100644
--- a/docker/install/conda_env/torch_gpu_pip.txt
+++ b/docker/install/conda_env/torch_gpu_pip.txt
@@ -15,7 +15,7 @@ rdflib
 requests[security]==2.28
 scikit-learn
 scipy
-torch==2.0.0+cu118
+torch==2.3.0+cu121
 torchdata
 torcheval
 torchmetrics
diff --git a/tests/scripts/build_dgl.sh b/tests/scripts/build_dgl.sh
index 2de8cef7a62e..504b5545e2c2 100644
--- a/tests/scripts/build_dgl.sh
+++ b/tests/scripts/build_dgl.sh
@@ -8,7 +8,10 @@ if [ $# -ne 1 ]; then
 fi
 
 if [[ $1 != "cpu" ]]; then
-    CMAKE_VARS="$CMAKE_VARS -DUSE_CUDA=ON"
+    # CI is now running on g4dn instance. Specify target arch to avoid below
+    # error: Unknown CUDA Architecture Name 9.0a in CUDA_SELECT_NVCC_ARCH_FLAGS
+    export TORCH_CUDA_ARCH_LIST=7.5 # For dgl_sparse and tensoradaptor.
+    CMAKE_VARS="$CMAKE_VARS -DUSE_CUDA=ON -DCUDA_ARCH_NAME=Turing" # For graphbolt.
 fi
 
 # This is a semicolon-separated list of Python interpreters containing PyTorch.
diff --git a/tests/scripts/task_unit_test.bat b/tests/scripts/task_unit_test.bat
index 01aa5b253556..a6830c22977f 100644
--- a/tests/scripts/task_unit_test.bat
+++ b/tests/scripts/task_unit_test.bat
@@ -14,7 +14,7 @@ SET DGLBACKEND=!BACKEND!
 SET DGL_LIBRARY_PATH=!CD!\build
 SET DGL_DOWNLOAD_DIR=!CD!\_download
 
-python -m pip install pytest psutil pandas pyyaml pydantic rdflib torchmetrics || EXIT /B 1
+python -m pip install pytest psutil pandas pyyaml pydantic rdflib torchmetrics expecttest || EXIT /B 1
 python -m pytest -v --junitxml=pytest_backend.xml --durations=100 tests\python\!DGLBACKEND! || EXIT /B 1
 python -m pytest -v --junitxml=pytest_common.xml --durations=100 tests\python\common || EXIT /B 1
 ENDLOCAL
diff --git a/tests/scripts/task_unit_test.sh b/tests/scripts/task_unit_test.sh
index 906f97ecd620..2dae1594a4ba 100644
--- a/tests/scripts/task_unit_test.sh
+++ b/tests/scripts/task_unit_test.sh
@@ -33,6 +33,8 @@ fi
 
 conda activate ${DGLBACKEND}-ci
 
+python3 -m pip install expecttest
+
 if [ $DGLBACKEND == "mxnet" ]
 then
     python3 -m pytest -v --junitxml=pytest_compute.xml --durations=100 --ignore=tests/python/common/test_ffi.py tests/python/common || fail "common"
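Not part of the patch itself: below is a minimal sanity check one could run inside the rebuilt CI images to confirm the upgrade took effect. The expected version strings (2.3.0+cu121 on the GPU image, 2.3.0+cpu on the CPU image) and the (7, 5) compute capability for the g4dn T4 are assumptions derived from the pip requirement files and the TORCH_CUDA_ARCH_LIST comment in build_dgl.sh above, not output captured from CI; the file name is hypothetical and nothing in the patch references it.

    # check_torch_env.py -- hypothetical helper, not referenced by the patch.
    # Prints the PyTorch/CUDA versions the upgraded CI images are expected to ship.
    import torch

    print("torch:", torch.__version__)          # expected: 2.3.0+cu121 (GPU image) or 2.3.0+cpu (CPU image)
    print("cuda runtime:", torch.version.cuda)  # expected: 12.1 on the GPU image, None on the CPU image

    if torch.cuda.is_available():
        # g4dn instances carry NVIDIA T4 GPUs, i.e. compute capability (7, 5),
        # matching TORCH_CUDA_ARCH_LIST=7.5 / -DCUDA_ARCH_NAME=Turing in build_dgl.sh.
        print("device:", torch.cuda.get_device_name(0))
        print("capability:", torch.cuda.get_device_capability(0))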