Skip to content

8274-mitigate-gpu-load-check #16092

8274-mitigate-gpu-load-check

8274-mitigate-gpu-load-check #16092

Workflow file for this run

# Jenkinsfile.monai-premerge
name: premerge-gpu
on:
# quick tests for pull requests and the releasing branches
push:
branches:
- main
- releasing/*
pull_request:
types: [opened, synchronize, closed]
concurrency:
# automatically cancel the previously triggered workflows when there's a newer version
group: build-gpu-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
GPU-quick-py3: # GPU with full dependencies
# if: ${{ github.repository == 'Project-MONAI/MONAI' && github.event.pull_request.merged != true }}
if: ${{ false }} # disable self-hosted job project-monai/monai#7039
strategy:
matrix:
environment:
- "PT19+CUDA114DOCKER"
- "PT110+CUDA111"
- "PT112+CUDA118DOCKER"
- "PT113+CUDA116"
- "PT210+CUDA121DOCKER"
include:
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes
- environment: PT110+CUDA111
pytorch: "torch==1.10.2 torchvision==0.11.3 --extra-index-url https://download.pytorch.org/whl/cu111"
base: "nvcr.io/nvidia/cuda:11.1.1-devel-ubuntu18.04"
- environment: PT112+CUDA118DOCKER
# 22.09: 1.13.0a0+d0d6b1f
pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error
base: "nvcr.io/nvidia/pytorch:22.09-py3"
- environment: PT113+CUDA116
pytorch: "torch==1.13.1 torchvision==0.14.1"
base: "nvcr.io/nvidia/cuda:11.6.1-devel-ubuntu18.04"
- environment: PT210+CUDA121DOCKER
# 23.08: 2.1.0a0+29c30b1
pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error
base: "nvcr.io/nvidia/pytorch:23.08-py3"
- environment: PT210+CUDA121DOCKER
# 24.08: 2.3.0a0+40ec155e58.nv24.3
pytorch: "-h" # we explicitly set pytorch to -h to avoid pip install error
base: "nvcr.io/nvidia/pytorch:24.08-py3"
container:
image: ${{ matrix.base }}
options: --gpus all --env NVIDIA_DISABLE_REQUIRE=true # workaround for unsatisfied condition: cuda>=11.6
runs-on: [self-hosted, linux, x64, common]
steps:
- uses: actions/checkout@v4
- name: apt install
if: github.event.pull_request.merged != true
run: |
apt-get update
apt-get install -y wget
if [ ${{ matrix.environment }} = "PT110+CUDA111" ] || \
[ ${{ matrix.environment }} = "PT113+CUDA116" ]
then
PYVER=3.9 PYSFX=3 DISTUTILS=python3-distutils && \
apt-get update && apt-get install -y --no-install-recommends \
curl \
pkg-config \
python$PYVER \
python$PYVER-dev \
python$PYSFX-pip \
$DISTUTILS \
rsync \
swig \
unzip \
zip \
zlib1g-dev \
libboost-locale-dev \
libboost-program-options-dev \
libboost-system-dev \
libboost-thread-dev \
libboost-test-dev \
libgoogle-glog-dev \
libjsoncpp-dev \
cmake \
git && \
rm -rf /var/lib/apt/lists/* && \
export PYTHONIOENCODING=utf-8 LC_ALL=C.UTF-8 && \
rm -f /usr/bin/python && \
rm -f /usr/bin/python`echo $PYVER | cut -c1-1` && \
ln -s /usr/bin/python$PYVER /usr/bin/python && \
ln -s /usr/bin/python$PYVER /usr/bin/python`echo $PYVER | cut -c1-1` &&
curl -O https://bootstrap.pypa.io/get-pip.py && \
python get-pip.py && \
rm get-pip.py;
fi
- if: matrix.environment == 'PT19+CUDA114DOCKER'
name: Optional Cupy dependency (cuda114)
run: echo "cupy-cuda114" >> requirements-dev.txt
- name: Install dependencies
if: github.event.pull_request.merged != true
run: |
which python
python -m pip install --upgrade pip wheel
# fixes preinstalled ruamel_yaml error from the docker image
rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/ruamel*
rm -rf $(python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())")/llvmlite* #6377
python -m pip install ${{ matrix.pytorch }}
python -m pip install -r requirements-dev.txt
python -m pip list
- name: Run quick tests (GPU)
if: github.event.pull_request.merged != true
run: |
git clone --depth 1 \
https://github.com/Project-MONAI/MONAI-extra-test-data.git /MONAI-extra-test-data
export MONAI_EXTRA_TEST_DATA="/MONAI-extra-test-data"
nvidia-smi
export LAUNCH_DELAY=$(python -c "import numpy; print(numpy.random.randint(30) * 10)")
echo "Sleep $LAUNCH_DELAY"
sleep $LAUNCH_DELAY
export CUDA_VISIBLE_DEVICES=$(coverage run -m tests.utils | tail -n 1)
echo $CUDA_VISIBLE_DEVICES
trap 'if pgrep python; then pkill python; fi;' ERR
python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null &
python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))"
python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))'
python -c "import monai; monai.config.print_config()"
# build for the current self-hosted CI Tesla V100
BUILD_MONAI=1 TORCH_CUDA_ARCH_LIST="7.0" ./runtests.sh --build --disttests
./runtests.sh --quick --unittests
if [ ${{ matrix.environment }} = "PT113+CUDA116" ]; then
# test the clang-format tool downloading once
coverage run -m tests.clang_format_utils
fi
coverage xml --ignore-errors
if pgrep python; then pkill python; fi
shell: bash
- name: Upload coverage
if: ${{ github.head_ref != 'dev' && github.event.pull_request.merged != true }}
uses: codecov/codecov-action@v5
with:
files: ./coverage.xml