Skip to content

Commit

Permalink
[CI] upgrade pytorch version for benchmark CI (dmlc#5102)
Browse files Browse the repository at this point in the history
* [CI] upgrade pytorch version for benchmark CI

* update build arguments

* update

* update

* upgrade torch to 1.13

* update docker image

* update cmake args

* try with cu116_torch112

* update build

* update

* update

* update

* update docker image

* update

* update

* update

* update

* final update

* fix continue running

* update

* update

* update
  • Loading branch information
Rhett-Ying authored Jan 6, 2023
1 parent dde5cf5 commit 46a3fc2
Show file tree
Hide file tree
Showing 9 changed files with 31 additions and 45 deletions.
7 changes: 4 additions & 3 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ def is_admin(name) {
return (name in admins)
}

def regression_test_done = false

pipeline {
agent any
triggers {
Expand Down Expand Up @@ -196,7 +198,6 @@ pipeline {
}
when { triggeredBy 'IssueCommentCause' }
steps {
// container('dgl-ci-lint') {
checkout scm
script {
def comment = env.GITHUB_COMMENT
Expand Down Expand Up @@ -229,12 +230,12 @@ pipeline {
}
pullRequest.comment("Finished the Regression test. Result table is at https://dgl-asv-data.s3-us-west-2.amazonaws.com/${env.GIT_COMMIT}_${instance_type}/results/result.csv. Jenkins job link is ${RUN_DISPLAY_URL}. ")
currentBuild.result = 'SUCCESS'
return
regression_test_done = true
}
// }
}
}
stage('CI') {
when { expression { !regression_test_done } }
stages {
stage('Lint Check') {
agent {
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ ROOT=/asv/dgl

conda activate base
pip install --upgrade pip
pip install asv
# Newer asv versions (e.g. 0.5.1) use a different result format,
# so we pin the version here; otherwise `generate_excel.py` would have to be changed.
pip install asv==0.4.2
pip uninstall -y dgl

export DGL_BENCH_DEVICE=$DEVICE
Expand Down
16 changes: 5 additions & 11 deletions benchmarks/scripts/build_dgl_asv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,22 @@

set -e

# . /opt/conda/etc/profile.d/conda.sh
# conda activate pytorch-ci
# Default building only with cpu
DEVICE=${DGL_BENCH_DEVICE:-cpu}

pip install -r /asv/torch_gpu_pip.txt
pip install pandas rdflib ogb

# build
if [[ $DEVICE == "cpu" ]]; then
CMAKE_VARS=""
else
CMAKE_VARS="-DUSE_CUDA=ON"
CMAKE_VARS="-DUSE_OPENMP=ON -DBUILD_TORCH=ON -DBUILD_SPARSE=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda"
if [[ $DEVICE == "gpu" ]]; then
CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
fi
arch=`uname -m`
if [[ $arch == *"x86"* ]]; then
CMAKE_VARS="-DUSE_AVX=ON $CMAKE_VARS"
fi
mkdir -p build
pushd build
cmake -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda -DBUILD_TORCH=ON $CMAKE_VARS ..
make -j
cmake $CMAKE_VARS ..
make -j8
popd

# conda deactivate
4 changes: 0 additions & 4 deletions benchmarks/scripts/generate_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,9 @@ def get_branch_name_from_hash(hash):
def main():
results_path = Path("../results")
results_path.is_dir()
benchmark_json_path = results_path / "benchmarks.json"
with benchmark_json_path.open() as f:
benchmark_json = json.load(f)
machines = [f for f in results_path.glob("*") if f.is_dir()]
output_results_dict = {}
for machine in machines:
# commit_results_dict = {}
per_machine_result = {}
commit_results_json_paths = [
f for f in machine.glob("*") if f.name != "machine.json"
Expand Down
2 changes: 0 additions & 2 deletions benchmarks/scripts/install_dgl_asv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

set -e

# . /opt/conda/etc/profile.d/conda.sh

# install
pushd python
rm -rf build *.egg-info dist
Expand Down
14 changes: 8 additions & 6 deletions benchmarks/scripts/publish.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,20 +26,20 @@ else
fi

WS_ROOT=/asv/dgl
docker pull public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170
if [ -z "$DGL_REG_CONF"]; then
docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116
if [ -z "$DGL_REG_CONF" ]; then
DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
else
DOCKER_ENV_OPT=" -e DGL_REG_CONF=$DGL_REG_CONF $DOCKER_ENV_OPT"
fi

if [ -z "$INSTANCE_TYPE"]; then
if [ -z "$INSTANCE_TYPE" ]; then
DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
else
DOCKER_ENV_OPT=" -e INSTANCE_TYPE=$INSTANCE_TYPE $DOCKER_ENV_OPT"
fi

if [ -z "$MOUNT_PATH"]; then
if [ -z "$MOUNT_PATH" ]; then
DOCKER_MOUNT_OPT=""
else
DOCKER_MOUNT_OPT="-v ${MOUNT_PATH}:/tmp/dataset -v ${MOUNT_PATH}/dgl_home/:/root/.dgl/"
Expand All @@ -56,16 +56,18 @@ if [[ $DEVICE == "cpu" ]]; then
$DOCKER_MOUNT_OPT \
$DOCKER_ENV_OPT \
--shm-size="16g" \
--hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170 /bin/bash
--hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
else
docker run --name dgl-reg \
--rm --gpus all \
$DOCKER_MOUNT_OPT \
$DOCKER_ENV_OPT \
--shm-size="16g" \
--hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmakrk_pyg_dgl:cu111_torch181_pyg170 /bin/bash
--hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
fi

pwd

docker exec dgl-reg mkdir -p $WS_ROOT
docker cp ../../.git dgl-reg:$WS_ROOT
docker cp ../ dgl-reg:$WS_ROOT/benchmarks/
Expand Down
12 changes: 6 additions & 6 deletions benchmarks/scripts/torch_gpu_pip.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
--find-links https://download.pytorch.org/whl/torch
torch==1.9.0+cu111
torchvision
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==1.13.1+cu116
torchvision==0.14.1+cu116
torchmetrics
pytest
nose
numpy
cython
scipy
networkx==2.5.1
networkx
matplotlib
nltk
requests[security]
Expand All @@ -15,5 +16,4 @@ awscli
torchtext
pandas
rdflib
ogb==1.3.1
torchmetrics
ogb
15 changes: 4 additions & 11 deletions docker/Dockerfile.ci_gpu_cu11 → docker/Dockerfile.ci_benchmark
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# CI docker GPU env
FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu16.04
FROM nvidia/cuda:11.6.0-cudnn8-devel-ubuntu20.04

ENV TZ=US
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

RUN apt-get update --fix-missing

Expand All @@ -15,16 +18,6 @@ RUN bash /install/ubuntu_install_conda.sh

ENV CONDA_ALWAYS_YES="true"

COPY install/conda_env/torch_gpu.yml /install/conda_env/torch_gpu.yml
COPY install/conda_env/torch_gpu_pip_latest.txt /install/conda_env/torch_gpu_pip.txt
RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/torch_gpu.yml"]

# COPY install/conda_env/tensorflow_gpu.yml /install/conda_env/tensorflow_gpu.yml
# RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/tensorflow_gpu.yml"]

# COPY install/conda_env/mxnet_gpu.yml /install/conda_env/mxnet_gpu.yml
# RUN ["/bin/bash", "-i", "-c", "conda env create -f /install/conda_env/mxnet_gpu.yml"]

ENV CONDA_ALWAYS_YES=

# Environment variables
Expand Down
2 changes: 1 addition & 1 deletion docker/install/ubuntu_install_conda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ apt-get update --fix-missing && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh -O ~/miniconda.sh && \
wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
/bin/bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda clean -tipsy && \
Expand Down

0 comments on commit 46a3fc2

Please sign in to comment.