8274-mitigate-gpu-load-check #16092
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Jenkinsfile.monai-premerge
name: premerge-gpu

on:
  # quick tests for pull requests and the releasing branches
  push:
    branches:
      - main
      - releasing/*
  pull_request:
    # "closed" is listed as a trigger; the job's steps guard themselves with
    # `github.event.pull_request.merged != true`, so a merged PR does no work.
    types: [opened, synchronize, closed]
concurrency:
  # automatically cancel the previously triggered workflows when there's a newer version
  # The group key is the pull-request number when the event has one,
  # otherwise the git ref (e.g. for push events).
  group: build-gpu-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  # Quick GPU test job: one run per matrix environment, each inside the
  # container image given by the matching `include` entry.
  GPU-quick-py3:  # GPU with full dependencies
    # if: ${{ github.repository == 'Project-MONAI/MONAI' && github.event.pull_request.merged != true }}
    if: ${{ false }}  # disable self-hosted job project-monai/monai#7039
    strategy:
      matrix:
        environment:
          # NOTE(review): no `include` entry below defines `pytorch`/`base` for
          # PT19+CUDA114DOCKER, so `matrix.base` and `matrix.pytorch` are empty
          # for this environment (empty container image, bare `pip install`).
          # Add the missing entry or drop the environment before re-enabling.
          - "PT19+CUDA114DOCKER"
          - "PT110+CUDA111"
          - "PT112+CUDA118DOCKER"
          - "PT113+CUDA116"
          - "PT210+CUDA121DOCKER"
        include:
          # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes
          - environment: PT110+CUDA111
            pytorch: "torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu111"
            base: "nvcr.io/nvidia/cuda:11.1.1-devel-ubuntu18.04"
          - environment: PT112+CUDA118DOCKER
            # 22.09: 1.13.0a0+d0d6b1f
            pytorch: "-h"  # we explicitly set pytorch to -h to avoid pip install error
            base: "nvcr.io/nvidia/pytorch:22.09-py3"
          - environment: PT113+CUDA116
            pytorch: "torch==1.13.1 torchvision==0.14.1"
            base: "nvcr.io/nvidia/cuda:11.6.1-devel-ubuntu18.04"
          - environment: PT210+CUDA121DOCKER
            # 23.08: 2.1.0a0+29c30b1
            pytorch: "-h"  # we explicitly set pytorch to -h to avoid pip install error
            base: "nvcr.io/nvidia/pytorch:23.08-py3"
          # NOTE(review): second entry for the same environment name; it looks
          # like it supersedes the 23.08 values above, leaving that image
          # untested. Presumably this needs its own environment name — confirm.
          - environment: PT210+CUDA121DOCKER
            # 24.08: 2.3.0a0+40ec155e58.nv24.3
            pytorch: "-h"  # we explicitly set pytorch to -h to avoid pip install error
            base: "nvcr.io/nvidia/pytorch:24.08-py3"
    container:
      image: ${{ matrix.base }}
      options: --gpus all --env NVIDIA_DISABLE_REQUIRE=true  # workaround for unsatisfied condition: cuda>=11.6
    runs-on: [self-hosted, linux, x64, common]
    steps:
      - uses: actions/checkout@v4
      # Installs wget everywhere; for the two plain CUDA base images it also
      # installs Python 3.9 plus build tooling, repoints /usr/bin/python and
      # /usr/bin/python3 at it, and bootstraps pip via get-pip.py.
      - name: apt install
        if: github.event.pull_request.merged != true
        run: |
          apt-get update
          apt-get install -y wget
          if [ "${{ matrix.environment }}" = "PT110+CUDA111" ] || \
            [ "${{ matrix.environment }}" = "PT113+CUDA116" ]
          then
          PYVER=3.9 PYSFX=3 DISTUTILS=python3-distutils && \
          apt-get update && apt-get install -y --no-install-recommends \
            curl \
            pkg-config \
            python$PYVER \
            python$PYVER-dev \
            python$PYSFX-pip \
            $DISTUTILS \
            rsync \
            swig \
            unzip \
            zip \
            zlib1g-dev \
            libboost-locale-dev \
            libboost-program-options-dev \
            libboost-system-dev \
            libboost-thread-dev \
            libboost-test-dev \
            libgoogle-glog-dev \
            libjsoncpp-dev \
            cmake \
            git && \
          rm -rf /var/lib/apt/lists/* && \
          export PYTHONIOENCODING=utf-8 LC_ALL=C.UTF-8 && \
          rm -f /usr/bin/python && \
          rm -f /usr/bin/python`echo $PYVER | cut -c1-1` && \
          ln -s /usr/bin/python$PYVER /usr/bin/python && \
          ln -s /usr/bin/python$PYVER /usr/bin/python`echo $PYVER | cut -c1-1` && \
          curl -O https://bootstrap.pypa.io/get-pip.py && \
          python get-pip.py && \
          rm get-pip.py;
          fi
      # Appends the CuPy wheel name to the dev requirements for this one
      # environment only.
      - if: matrix.environment == 'PT19+CUDA114DOCKER'
        name: Optional Cupy dependency (cuda114)
        run: echo "cupy-cuda114" >> requirements-dev.txt
      # Installs the matrix-selected PyTorch spec ("-h" for images that ship
      # PyTorch) and then the development requirements.
      - name: Install dependencies
        if: github.event.pull_request.merged != true
        run: |
          which python
          python -m pip install --upgrade pip wheel
          # fixes preinstalled ruamel_yaml error from the docker image
          rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel*
          rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/llvmlite*  #6377
          python -m pip install ${{ matrix.pytorch }}
          python -m pip install -r requirements-dev.txt
          python -m pip list
      # Clones the extra test data, selects GPUs, builds the extensions and
      # runs the dist and quick unit tests, then writes coverage.xml.
      - name: Run quick tests (GPU)
        if: github.event.pull_request.merged != true
        run: |
          git clone --depth 1 \
            https://github.com/Project-MONAI/MONAI-extra-test-data.git /MONAI-extra-test-data
          export MONAI_EXTRA_TEST_DATA="/MONAI-extra-test-data"
          nvidia-smi
          # random delay of 0-290 seconds in steps of 10
          # (presumably to stagger concurrent jobs on the shared runner — confirm)
          export LAUNCH_DELAY=$(python -c "import numpy; print(numpy.random.randint(30) * 10)")
          echo "Sleep $LAUNCH_DELAY"
          sleep "$LAUNCH_DELAY"
          # the last line printed by tests.utils is used as the device list
          export CUDA_VISIBLE_DEVICES=$(coverage run -m tests.utils | tail -n 1)
          echo $CUDA_VISIBLE_DEVICES
          # on any failing command, kill leftover python processes (the background loop below)
          trap 'if pgrep python; then pkill python; fi;' ERR
          # background process that allocates a tensor on cuda:0 and cuda:1 and loops until killed
          python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
          python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
          python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
          python -c "import monai; monai.config.print_config()"
          # build for the current self-hosted CI Tesla V100
          BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --build --disttests
          ./runtests.sh --quick --unittests
          if [ "${{ matrix.environment }}" = "PT113+CUDA116" ]; then
            # test the clang-format tool downloading once
            coverage run -m tests.clang_format_utils
          fi
          coverage xml --ignore-errors
          # stop the background python process started above
          if pgrep python; then pkill python; fi
        shell: bash
      # Uploads the generated coverage report, skipped for PRs from a head
      # branch named "dev" and for merged PRs.
      - name: Upload coverage
        if: ${{ github.head_ref != 'dev' && github.event.pull_request.merged != true }}
        uses: codecov/codecov-action@v5
        with:
          files: ./coverage.xml