Skip to content

Commit

Permalink
Add OSS GPU tests
Browse files Browse the repository at this point in the history
ghstack-source-id: c427425d8c7392034dca50c55a8c58205ef0ef10
Pull Request resolved: #231
  • Loading branch information
PaliC committed Oct 26, 2022
1 parent 932450a commit 9235d12
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 22 deletions.
89 changes: 89 additions & 0 deletions .github/scripts/install_nvidia_utils_linux.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Installs the NVIDIA driver and the nvidia-docker2 runtime on a CI runner.
# The workflow invokes this file with `bash`, so bash-only features are safe.
#
# Strict mode: abort on the first failing command (-e), on any use of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail


# Distribution tag built from /etc/os-release as "<ID><VERSION_ID>".
# Either field may be absent on some distributions, so both default to empty
# instead of tripping `nounset`; a missing file yields "unknown", which the
# distribution dispatch later in the script rejects with an explicit error.
if [[ -r /etc/os-release ]]; then
  DISTRIBUTION=$(. /etc/os-release; echo "${ID:-}${VERSION_ID:-}")
else
  DISTRIBUTION="unknown"
fi
readonly DISTRIBUTION

# Driver release to install and the file name of its .run installer.
readonly DRIVER_VERSION="515.57"
readonly DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"
# yum repository definition for nvidia-docker, selected by distribution tag.
readonly YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"

#######################################
# Install the nvidia-docker2 package through yum and restart docker.
# Used by the script's dispatch for amzn* distributions.
# Globals:   YUM_REPO_URL (read) - repo definition passed to yum-config-manager
# Arguments: none
# Outputs:   command trace on stderr (xtrace), plus whatever yum/systemctl print
# Returns:   exit status of the subshell that runs the commands
#######################################
install_nvidia_docker2_amzn2() (
  # The function body is a subshell, so enabling tracing here does not
  # change the caller's shell options.
  set -o xtrace

  # yum-utils provides the yum-config-manager command used on the next line.
  sudo yum install -y yum-utils
  sudo yum-config-manager --add-repo "$YUM_REPO_URL"
  sudo yum install -y nvidia-docker2

  # Restart the docker service after the package has been installed.
  sudo systemctl restart docker
)

#######################################
# Install the pinned NVIDIA driver through yum/curl unless that exact version
# is already present, then verify the result with nvidia-smi.
# Globals:   DRIVER_VERSION (read) - version string the host must end up with
#            DRIVER_FN (read)      - installer file name appended to the S3 URL
# Arguments: none
# Outputs:   progress messages and nvidia-smi output on stdout; xtrace on stderr
# Returns:   0 when the final nvidia-smi run exits with 0 or 14, otherwise
#            that nvidia-smi status. When the caller runs under `set -e`, any
#            failing install step ends the subshell with that step's status.
#######################################
install_nvidia_driver_amzn2() {
  (
    # Everything runs in a subshell so tracing and the helper variables
    # below do not leak into the caller.
    set -x

    # Purge any nvidia driver installed from RHEL repo
    sudo yum remove -y nvidia-driver-latest-dkms

    HAS_NVIDIA_DRIVER=0
    # Check if NVIDIA driver has already been installed
    if [ -x "$(command -v nvidia-smi)" ]; then
      # The driver exists, check its version next. A broken driver can make
      # nvidia-smi fail; record the status instead of letting `set -e` abort,
      # so that we still fall through to the (re)installation below.
      NVIDIA_SMI_STATUS=0
      INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader) \
        || NVIDIA_SMI_STATUS=$?
      # nvidia-smi prints one line per GPU; keep only the first so that a
      # multi-GPU host still compares equal to the single expected version.
      INSTALLED_DRIVER_VERSION=${INSTALLED_DRIVER_VERSION%%$'\n'*}

      # 0 and 14 are the statuses this script accepts from nvidia-smi
      # (same allow-list as the final verification below).
      if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
        echo "Failed to get NVIDIA driver version (nvidia-smi exited with status $NVIDIA_SMI_STATUS). Continuing"
      elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
        echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
      else
        HAS_NVIDIA_DRIVER=1
        echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
      fi
    fi

    if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
      sudo yum groupinstall -y "Development Tools"
      # ensure our kernel install is the same as our underlying kernel,
      # groupinstall "Development Tools" has a habit of mismatching kernel headers
      sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
      sudo modprobe backlight
      sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
      # On installer failure, dump its log for diagnosis and still fail the step.
      sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
      sudo rm -fv /tmp/nvidia_driver
    fi

    (
      # errexit is switched off here so the nvidia-smi status can be inspected
      # instead of terminating the subshell immediately.
      set +e
      nvidia-smi
      status=$?
      # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
      if [ $status -eq 0 ] || [ $status -eq 14 ]; then
        echo "INFO: Ignoring allowed status ${status}"
      else
        echo "ERROR: nvidia-smi exited with unresolved status ${status}"
        exit ${status}
      fi
    )
  )
}

# Step 1: driver. Only distribution tags starting with "amzn" have an
# installer in this script; anything else stops the run with status 1.
echo "== Installing nvidia driver ${DRIVER_FN} =="
if [[ "${DISTRIBUTION}" == amzn* ]]; then
  install_nvidia_driver_amzn2
else
  echo "ERROR: Unknown distribution ${DISTRIBUTION}"
  exit 1
fi

# Step 2: container toolkit, selected by the same distribution tag.
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
if [[ "${DISTRIBUTION}" == amzn* ]]; then
  install_nvidia_docker2_amzn2
else
  echo "ERROR: Unknown distribution ${DISTRIBUTION}"
  exit 1
fi

20 changes: 16 additions & 4 deletions .github/workflows/runtime_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,27 @@ jobs:
matrix:
python-major-version: [3]
python-minor-version: [7,8,9,10]
platform: [ubuntu-18.04]
platform: [linux.4xlarge.nvidia.gpu]
fail-fast: false
runs-on: ${{ matrix.platform }}
steps:
- name: Checkout MultiPy
uses: actions/checkout@v2
with:
submodules: true

- name: Clean up previous CUDA driver installations
shell: bash
run: |
set -x
yum list installed | grep nvidia || true
yum list installed | grep cuda || true
sudo yum remove -y cuda || true
sudo yum remove -y cuda-drivers || true
sudo yum remove -y "*nvidia*" || true
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
run: |
bash .github/scripts/install_nvidia_utils_linux.sh || true
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
- name: Setup SSH (Click me for login details)
uses: ./.github/actions/setup-ssh
with:
Expand All @@ -30,11 +42,11 @@ jobs:
- name: Build
env:
DOCKER_BUILDKIT: 1
run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} .
run: nvidia-docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} --build-arg BUILD_CUDA_TESTS=1 .

- name: Test
run: |
docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy"
nvidia-docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy && multipy/runtime/build/test_deploy_gpu"
- name: Examples
run: |
Expand Down
26 changes: 12 additions & 14 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG BASE_IMAGE=nvidia/cuda:11.3.1-devel-ubuntu18.04
ARG BASE_IMAGE=nvidia/cuda:11.6.1-devel-ubuntu18.04

FROM ${BASE_IMAGE} as dev-base

Expand Down Expand Up @@ -59,13 +59,17 @@ COPY .git .git
COPY .gitmodules .gitmodules
COPY multipy multipy
COPY compat-requirements.txt compat-requirements.txt
COPY setup.py setup.py
COPY README.md README.md
COPY dev-requirements.txt dev-requirements.txt

RUN git submodule update --init --recursive --jobs 0

# Install conda/pyenv + necessary python dependencies
FROM dev-base as conda-pyenv
ARG PYTHON_MAJOR_VERSION=3
ARG PYTHON_MINOR_VERSION=8
ARG BUILD_CUDA_TESTS=0
ENV PYTHON_MINOR_VERSION=${PYTHON_MINOR_VERSION}
ENV PYTHON_VERSION=${PYTHON_MAJOR_VERSION}.${PYTHON_MINOR_VERSION}
RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
Expand All @@ -75,7 +79,7 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} mkl mkl-include conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda install -y -c conda-forge libpython-static=${PYTHON_VERSION} && \
/opt/conda/bin/conda install -y pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch-nightly && \
/opt/conda/bin/conda install -y pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch-nightly -c nvidia && \
/opt/conda/bin/conda clean -ya; \
else \
pip3 install virtualenv && \
Expand All @@ -84,29 +88,23 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
~/.pyenv/bin/pyenv install --force 3.7.10 && \
virtualenv -p ~/.pyenv/versions/3.7.10/bin/python3 ~/venvs/multipy && \
source ~/venvs/multipy/bin/activate && \
pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113; \
pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu116; \
fi

# Build/Install pytorch with post-cxx11 ABI
FROM conda-pyenv as build
WORKDIR /opt/multipy/multipy/runtime/third-party/pytorch
COPY --from=conda-pyenv /opt/conda* /opt/conda
COPY --from=submodule-update /opt/multipy /opt/multipy

WORKDIR /opt/multipy

# Build Multipy
RUN rm -r multipy/runtime/build; mkdir multipy/runtime/build && \
cd multipy/runtime/build && \
if [[ ${PYTHON_MINOR_VERSION} -lt 8 ]]; then \
source ~/venvs/multipy/bin/activate && \
cmake -DLEGACY_PYTHON_PRE_3_8=ON ..; \
RUN ls && pwd && rm -rf multipy/runtime/build && \
if [[ ${BUILD_CUDA_TESTS} -eq 1 ]]; then \
python3 -m pip install -e . --install-option="--cudatests"; \
else \
cmake -DLEGACY_PYTHON_PRE_3_8=OFF ..; \
python3 -m pip install -e .; \
fi && \
cmake --build . --config Release -j && \
cmake --install . --prefix "." && \
cd ../example && python generate_examples.py
python multipy/runtime/example/generate_examples.py

# Build examples
COPY examples examples
Expand Down
12 changes: 8 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def get_cmake_version():


class MultipyRuntimeCmake(object):
user_options = [("cmakeoff", None, None), ("abicxx", None, None)]
user_options = [("cmakeoff", None, None), ("cudatests", None, None), ("abicxx", None, None)]


class MultipyRuntimeDevelop(MultipyRuntimeCmake, develop):
Expand All @@ -41,24 +41,28 @@ def initialize_options(self):
# TODO(tristanr): remove once unused
self.abicxx = None

self.cudatests = None
def finalize_options(self):
develop.finalize_options(self)
if self.cmakeoff is not None:
self.distribution.get_command_obj("build_ext").cmake_off = True
if self.cudatests is not None:
self.distribution.get_command_obj("build_ext").cuda_tests_flag = "ON"


class MultipyRuntimeBuild(MultipyRuntimeCmake, build_ext):
user_options = build_ext.user_options + MultipyRuntimeCmake.user_options
cmake_off = False
cuda_tests_flag = "OFF"

def run(self):
if self.cmake_off:
return
try:
cmake_version_comps = get_cmake_version().split(".")
if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "19":
if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "12":
raise RuntimeError(
"CMake 3.19 or later required for multipy runtime installation."
"CMake 3.12 or later required for multipy runtime installation."
)
except OSError:
raise RuntimeError(
Expand All @@ -74,7 +78,7 @@ def run(self):
print(f"-- Running multipy runtime makefile in dir {build_dir_abs}")
try:
subprocess.run(
[f"cmake -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
[f"cmake -DBUILD_CUDA_TESTS={self.cuda_tests_flag} -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
cwd=build_dir_abs,
shell=True,
check=True,
Expand Down

0 comments on commit 9235d12

Please sign in to comment.