# Patch summary: switch BASE_IMAGE to nvcr.io/nvidia/cuda-dl-base, remove the TCPx,
# Nsight, OFED, EFA and entrypoint-check sections of Dockerfile.base together with the
# scripts they used, and replace install-cudnn.sh / install-nccl.sh with
# symlnk-cudnn.sh / symlnk-nccl.sh, which only build the /opt/nvidia/{cudnn,nccl}
# link farms from the packages already present in the image.
# NOTE(review): this patch was restored from a whitespace-collapsed copy. Indentation
# inside context and removed lines is reconstructed and may not match the target tree
# byte-for-byte -- confirm, or apply with `git apply --ignore-whitespace`.
# NOTE(review): the `index` hashes of the two added scripts were recorded before their
# contents were revised below; treat them as informational only.
diff --git a/.github/container/Dockerfile.base b/.github/container/Dockerfile.base
index 4850f0635..108874a70 100644
--- a/.github/container/Dockerfile.base
+++ b/.github/container/Dockerfile.base
@@ -1,27 +1,10 @@
 # syntax=docker/dockerfile:1-labs
-ARG BASE_IMAGE=nvidia/cuda:12.6.3-devel-ubuntu24.04
+ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:24.11-cuda12.6-devel-ubuntu24.04
 ARG GIT_USER_NAME="JAX Toolbox"
 ARG GIT_USER_EMAIL=jax@nvidia.com
 ARG CLANG_VERSION=18
 ARG JAX_TOOLBOX_REF
 
-###############################################################################
-## Obtain GCP's NCCL TCPx plugin
-###############################################################################
-
-FROM us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx:v3.1.10 AS tcpx-installer-amd64
-
-# make a stub arm64 container because GCP does not provide an arm64 version of the plugin
-FROM ubuntu AS tcpx-installer-arm64
-RUN <<"OUTEREOF" bash -ex
-mkdir -p /scripts /var/lib/tcpx/lib64
-echo '#!/bin/bash' > /scripts/container_entry.sh
-chmod +x /scripts/container_entry.sh
-OUTEREOF
-
-FROM tcpx-installer-${TARGETARCH} AS tcpx-installer
-RUN /scripts/container_entry.sh install
-
 ###############################################################################
 ## Build base image
 ###############################################################################
@@ -153,50 +136,18 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
 RUN pip install --upgrade --ignore-installed --no-cache-dir -e /opt/pip pip-tools && rm -rf ~/.cache/*
 
 ###############################################################################
-## Install TCPx
-###############################################################################
-
-ENV TCPX_LIBRARY_PATH=/usr/local/tcpx/lib64
-COPY --from=tcpx-installer /var/lib/tcpx/lib64 ${TCPX_LIBRARY_PATH}
-
-###############################################################################
-## Install the latest versions of Nsight Systems and Nsight Compute
-###############################################################################
-
-ADD install-nsight.sh /usr/local/bin
-RUN install-nsight.sh
-
-###############################################################################
-## Install cuDNN
+## Symlink for cuDNN
 ###############################################################################
 
-ADD install-cudnn.sh /usr/local/bin
-RUN install-cudnn.sh
+ADD symlnk-cudnn.sh /usr/local/bin
+RUN symlnk-cudnn.sh
 
 ###############################################################################
-## Install NCCL
+## Symlink for NCCL
 ###############################################################################
 
-ADD install-nccl.sh /usr/local/bin
-RUN install-nccl.sh
-
-###############################################################################
-## RoCE and InfiniteBand support
-###############################################################################
-
-ADD install-ofed.sh /usr/local/bin
-RUN install-ofed.sh
-
-##############################################################################
-## Amazon EFA support (need to run it inside container separately)
-##############################################################################
-
-ADD --chmod=777 \
-    install-efa.sh \
-    test-aws-efa.sh \
-    /usr/local/bin/
-ENV LD_LIBRARY_PATH=/opt/amazon/efa/lib:${LD_LIBRARY_PATH}
-ENV PATH=/opt/amazon/efa/bin:${PATH}
+ADD symlnk-nccl.sh /usr/local/bin
+RUN symlnk-nccl.sh
 
 ##############################################################################
 ## NCCL sanity check utility
@@ -207,18 +158,6 @@ ADD nccl-sanity-check.cu /opt
 RUN install-nccl-sanity-check.sh
 ADD jax-nccl-test parallel-launch /usr/local/bin/
 
-###############################################################################
-## Add the systemcheck to the entrypoint.
-###############################################################################
-
-COPY check-shm.sh /opt/nvidia/entrypoint.d/
-
-###############################################################################
-## Add the GCP - TCPX check to the entrypoint.
-###############################################################################
-
-# TODO(chaserileyroberts): Reenable once fully tested on GCP.
-# COPY gcp-autoconfig.sh /opt/nvidia/entrypoint.d/
 
 ###############################################################################
 ## Install the nsys-jax JAX/XLA-aware profiling scripts, patch Nsight Systems
diff --git a/.github/container/check-shm.sh b/.github/container/check-shm.sh
deleted file mode 100755
index c73b7095c..000000000
--- a/.github/container/check-shm.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-minimum_shm_size=1048576 # ~1GB in KBs
-
-# Grab the second line / second field of the output of `df`,
-# which is the size of shm in KBs.
-actual_shm_size=$(df /dev/shm | awk 'NR==2 {print $2}')
-
-if (( actual_shm_size < minimum_shm_size )); then
-  YELLOW='\033[0;33m'
-  NOCOLOR='\033[0m'
-
-  echo -e "${YELLOW}
-WARNING: Your shm is currenly less than 1GB. This may cause SIGBUS errors.
-To avoid this problem, you can manually set the shm size in docker with:
-
-  $ docker run ... --shm-size=1g ...
-${NOCOLOR}"
-fi
diff --git a/.github/container/install-cudnn.sh b/.github/container/install-cudnn.sh
deleted file mode 100755
index 1155c2a9d..000000000
--- a/.github/container/install-cudnn.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-export DEBIAN_FRONTEND=noninteractive
-export TZ=America/Los_Angeles
-
-CUDNN_MAJOR_VERSION=9
-
-apt-get update
-
-# Extract major CUDA version from `nvcc --version` output line
-# Input: "Cuda compilation tools, release X.Y, VX.Y.Z"
-# Output: X
-cuda_major_version=$(nvcc --version | sed -n 's/^.*release \([0-9]*\.[0-9]*\).*$/\1/p' | cut -d. -f1)
-
-# Find latest cuDNN version compatible with existing CUDA by matching
-# ${cuda_major_version} in the package version string
-# In most cases cuDNN release is behind CUDA ones. It is considered, that major
-# version of CUDA and cuDNN are compatible.
-# For example, CUDA 12.3 + cuDNN 8.9.6 (libcudnn8 version: 8.9.6.50-1+cuda12.2) is
-# considered to be compatible.
-if [[ ${CUDNN_MAJOR_VERSION} -le 8 ]]; then
-    libcudnn_name=libcudnn${CUDNN_MAJOR_VERSION}
-    libcudnn_dev_name=libcudnn${CUDNN_MAJOR_VERSION}-dev
-    version_pattern="s/^Version: \(.*+cuda${cuda_major_version}\.[0-9]*\)$/\1/p"
-elif [[ ${CUDNN_MAJOR_VERSION} -eq 9 ]]; then
-    libcudnn_name=libcudnn${CUDNN_MAJOR_VERSION}-cuda-${cuda_major_version}
-    libcudnn_dev_name=libcudnn${CUDNN_MAJOR_VERSION}-dev-cuda-${cuda_major_version}
-    version_pattern="s/^Version: \(${CUDNN_MAJOR_VERSION}\.[0-9.-]*\)$/\1/p"
-fi
-libcudnn_version=$(apt-cache show $libcudnn_name | sed -n "$version_pattern" | head -n 1)
-libcudnn_dev_version=$(apt-cache show $libcudnn_dev_name | sed -n "$version_pattern" | head -n 1)
-if [[ -z "${libcudnn_version}" || -z "${libcudnn_dev_version}" ]]; then
-    echo "Could not find compatible cuDNN version for CUDA ${cuda_version}"
-    exit 1
-fi
-
-apt-get install -y \
-    ${libcudnn_name}=${libcudnn_version} \
-    ${libcudnn_dev_name}=${libcudnn_dev_version}
-apt-get clean
-rm -rf /var/lib/apt/lists/*
-
-# Create a prefix with include/ and lib/ directories containing symlinks to the cuDNN
-# version that was just installed; this is useful to pass to XLA to avoid it fetching
-# its own copy of cuDNN.
-prefix=/opt/nvidia/cudnn
-if [[ -d "${prefix}" ]]; then
-  echo "Skipping link farm creation"
-  exit 1
-fi
-arch=$(uname -m)-linux-gnu
-for cudnn_file in $(dpkg -L ${libcudnn_name} ${libcudnn_dev_name} | sort -u); do
-  # Real files and symlinks are linked into $prefix
-  if [[ -f "${cudnn_file}" || -h "${cudnn_file}" ]]; then
-    # Replace /usr with $prefix
-    nosysprefix="${cudnn_file#"/usr/"}"
-    # include/x86_64-linux-gpu -> include/
-    noarchinclude="${nosysprefix/#"include/${arch}"/include}"
-    # cudnn_v9.h -> cudnn.h
-    noverheader="${noarchinclude/%"_v${CUDNN_MAJOR_VERSION}.h"/.h}"
-    # lib/x86_64-linux-gnu -> lib/
-    noarchlib="${noverheader/#"lib/${arch}"/lib}"
-    link_name="${prefix}/${noarchlib}"
-    link_dir=$(dirname "${link_name}")
-    mkdir -p "${link_dir}"
-    ln -s "${cudnn_file}" "${link_name}"
-  else
-    echo "Skipping ${cudnn_file}"
-  fi
-done
diff --git a/.github/container/install-efa.sh b/.github/container/install-efa.sh
deleted file mode 100755
index e1001b639..000000000
--- a/.github/container/install-efa.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-set -ex
-
-EFA_INSTALLER_VERSION=1.34.0 # or: latest
-AWS_OFI_NCCL_PREFIX=/opt/aws-ofi-nccl
-AWS_OFI_NCCL_VERSION=1.11.0
-
-apt update
-
-EFA_TMP=$(mktemp -d)
-pushd $EFA_TMP
-curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz
-tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz
-cd aws-efa-installer
-rm -v DEBS/UBUNTU2204/x86_64/{libpmix,openmpi,prrte}* # block installation of MPI components
-apt-get purge -y ibverbs-providers libibverbs-dev libibverbs1 libibumad-dev libibumad3 librdmacm1 librdmacm-dev ibverbs-utils
-./efa_installer.sh -g -y --skip-kmod --skip-limit-conf --no-verify |& tee install.log
-mv -v install.log /opt/amazon/efa/install.log
-popd
-rm -rf $EFA_TMP
-
-AWS_OFI_NCCL_TMP=$(mktemp -d)
-pushd $AWS_OFI_NCCL_TMP
-apt-get install -y libhwloc-dev
-curl -OL https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}-aws/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz
-tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws.tar.gz
-cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}-aws
-./configure --prefix=${AWS_OFI_NCCL_PREFIX} --with-libfabric=/opt/amazon/efa --with-cuda=/usr/local/cuda --with-mpi=/usr/local/mpi
-make -j$(nproc) install
-popd
-rm -rf $AWS_OFI_NCCL_TMP
-
-rm -rf /var/lib/apt/lists/*
-
-# Ranks higher than HPC-X => newly-installed libnccl-net.so becomes the default
-echo "${AWS_OFI_NCCL_PREFIX}/lib" > /etc/ld.so.conf.d/000_aws_ofi_nccl.conf
-ldconfig
diff --git a/.github/container/install-nccl.sh b/.github/container/install-nccl.sh
deleted file mode 100755
index 892bf2d4d..000000000
--- a/.github/container/install-nccl.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-set -ex -o pipefail
-
-export DEBIAN_FRONTEND=noninteractive
-export TZ=America/Los_Angeles
-
-# If NCCL is already installed, don't reinstall it. Print a message and exit
-if dpkg -s libnccl2 libnccl-dev &> /dev/null; then
-  echo "NCCL is already installed. Skipping installation."
-else
-  apt-get update
-
-  # Extract CUDA version from `nvcc --version` output line
-  # Input: "Cuda compilation tools, release X.Y, VX.Y.Z"
-  # Output: X.Y
-  cuda_version=$(nvcc --version | sed -n 's/^.*release \([0-9]*\.[0-9]*\).*$/\1/p')
-
-  # Find latest NCCL version compatible with existing CUDA by matching
-  # ${cuda_version} in the package version string
-  libnccl2_version=$(apt-cache show libnccl-dev | sed -n "s/^Version: \(.*+cuda${cuda_version}\)$/\1/p" | head -n 1)
-  libnccl_dev_version=$(apt-cache show libnccl-dev | sed -n "s/^Version: \(.*+cuda${cuda_version}\)$/\1/p" | head -n 1)
-  if [[ -z "${libnccl2_version}" || -z "${libnccl_dev_version}" ]]; then
-    echo "Could not find compatible NCCL version for CUDA ${cuda_version}"
-    exit 1
-  fi
-
-  apt-get install -y \
-    libnccl2=${libnccl2_version} \
-    libnccl-dev=${libnccl_dev_version}
-
-  apt-get clean
-  rm -rf /var/lib/apt/lists/*
-fi
-
-# Create a prefix with include/ and lib/ directories containing symlinks to the NCCL
-# version installed at the system level; this is useful to pass to XLA to avoid it
-# fetching its own copy.
-prefix=/opt/nvidia/nccl
-if [[ -d "${prefix}" ]]; then
-  echo "Skipping link farm creation"
-  exit 1
-fi
-arch=$(uname -m)-linux-gnu
-for nccl_file in $(dpkg -L libnccl2 libnccl-dev | sort -u); do
-  # Real files and symlinks are linked into $prefix
-  if [[ -f "${nccl_file}" || -h "${nccl_file}" ]]; then
-    # Replace /usr with $prefix and remove arch-specific lib directories
-    nosysprefix="${nccl_file#"/usr/"}"
-    noarchlib="${nosysprefix/#"lib/${arch}"/lib}"
-    link_name="${prefix}/${noarchlib}"
-    link_dir=$(dirname "${link_name}")
-    mkdir -p "${link_dir}"
-    ln -s "${nccl_file}" "${link_name}"
-  else
-    echo "Skipping ${nccl_file}"
-  fi
-done
diff --git a/.github/container/install-nsight.sh b/.github/container/install-nsight.sh
deleted file mode 100755
index dc0ef92cb..000000000
--- a/.github/container/install-nsight.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-set -ex -o pipefail
-
-# Repo for newer nsight versions
-UBUNTU_ARCH=$(dpkg --print-architecture)
-UBUNTU_VERSION=$(. /etc/os-release && echo ${ID}${VERSION_ID/./}) # e.g. ubuntu2204
-DEVTOOLS_URL=https://developer.download.nvidia.com/devtools/repos/${UBUNTU_VERSION}/${UBUNTU_ARCH}
-curl -o /usr/share/keyrings/nvidia.pub "${DEVTOOLS_URL}/nvidia.pub"
-echo "deb [signed-by=/usr/share/keyrings/nvidia.pub] ${DEVTOOLS_URL}/ /" > /etc/apt/sources.list.d/devtools-${UBUNTU_VERSION}-${UBUNTU_ARCH}.list
-
-export DEBIAN_FRONTEND=noninteractive
-export TZ=America/Los_Angeles
-
-apt-get update
-apt-get install -y nsight-compute nsight-systems-cli-2024.6.1
-apt-get clean
-
-rm -rf /var/lib/apt/lists/*
diff --git a/.github/container/install-ofed.sh b/.github/container/install-ofed.sh
deleted file mode 100755
index d6c678328..000000000
--- a/.github/container/install-ofed.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-export DEBIAN_FRONTEND=noninteractive
-export TZ=America/Los_Angeles
-
-# Install libnl (Netlink Protocol Library Suite), which provides a low-level API
-# for communication between kernel and user space processes in Linux. Essential for managing
-# networking components such as routing tables, network interfaces, and address resolution.
-
-apt-get update
-apt-get install -y \
-    curl \
-    libnl-route-3-200 \
-    libnl-3-dev \
-    libnl-route-3-dev
-
-# Download NVIDIA/Mellanox's OFED distribution and install
-
-WORKDIR=$(mktemp -d)
-pushd ${WORKDIR}
-
-MLNX_OFED_LINK="https://content.mellanox.com/ofed/MLNX_OFED-23.04-1.1.3.0/MLNX_OFED_LINUX-23.04-1.1.3.0-ubuntu22.04-$(uname -i).tgz"
-curl -s -L "${MLNX_OFED_LINK}" -o - | tar xz --no-anchored --wildcards 'DEBS/*' --strip-components=3
-
-dpkg -i libibverbs1_*.deb \
-        libibverbs-dev_*.deb \
-        librdmacm1_*.deb \
-        librdmacm-dev_*.deb \
-        libibumad3_*.deb \
-        libibumad-dev_*.deb \
-        ibverbs-utils_*.deb \
-        ibverbs-providers_*.deb
-
-popd
-
-# cleanup
-
-apt-get clean
-rm -rf /var/lib/apt/lists/*
-rm -rf ${WORKDIR}
diff --git a/.github/container/symlnk-cudnn.sh b/.github/container/symlnk-cudnn.sh
new file mode 100755
index 000000000..5db2c411f
--- /dev/null
+++ b/.github/container/symlnk-cudnn.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+set -ex -o pipefail
+
+CUDNN_MAJOR_VERSION=9
+
+# Create a prefix with include/ and lib/ directories containing symlinks to the cuDNN
+# version installed at the system level; this is useful to pass to XLA to avoid it
+# fetching its own copy of cuDNN.
+prefix=/opt/nvidia/cudnn
+if [[ -d "${prefix}" ]]; then
+  echo "${prefix} already exists; refusing to recreate the link farm" >&2
+  exit 1
+fi
+
+arch=$(uname -m)-linux-gnu
+# dpkg exits non-zero when the pattern matches no package; `|| true` keeps errexit and
+# pipefail from aborting here so that the explicit check below reports the problem.
+libcudnn_pkgs=$(dpkg -l 'libcudnn*' | awk '/^ii/ {print $2}' || true)
+if [[ -z "${libcudnn_pkgs}" ]]; then
+  echo "No libcudnn packages installed." >&2
+  exit 1
+fi
+
+for cudnn_file in $(dpkg -L ${libcudnn_pkgs} | sort -u); do
+  # Real files and symlinks are linked into $prefix
+  if [[ -f "${cudnn_file}" || -h "${cudnn_file}" ]]; then
+    # Replace /usr with $prefix
+    nosysprefix="${cudnn_file#"/usr/"}"
+    # include/x86_64-linux-gnu -> include/
+    noarchinclude="${nosysprefix/#"include/${arch}"/include}"
+    # cudnn_v9.h -> cudnn.h
+    noverheader="${noarchinclude/%"_v${CUDNN_MAJOR_VERSION}.h"/.h}"
+    # lib/x86_64-linux-gnu -> lib/
+    noarchlib="${noverheader/#"lib/${arch}"/lib}"
+    link_name="${prefix}/${noarchlib}"
+    link_dir=$(dirname "${link_name}")
+    mkdir -p "${link_dir}"
+    ln -s "${cudnn_file}" "${link_name}"
+  else
+    echo "Skipping ${cudnn_file}"
+  fi
+done
diff --git a/.github/container/symlnk-nccl.sh b/.github/container/symlnk-nccl.sh
new file mode 100755
index 000000000..33b4ebaa9
--- /dev/null
+++ b/.github/container/symlnk-nccl.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+set -ex -o pipefail
+
+# Create a prefix with include/ and lib/ directories containing symlinks to the NCCL
+# version installed at the system level; this is useful to pass to XLA to avoid it
+# fetching its own copy.
+prefix=/opt/nvidia/nccl
+if [[ -d "${prefix}" ]]; then
+  echo "${prefix} already exists; refusing to recreate the link farm" >&2
+  exit 1
+fi
+arch=$(uname -m)-linux-gnu
+# dpkg exits non-zero when the pattern matches no package; `|| true` keeps errexit and
+# pipefail from aborting here so that the explicit check below reports the problem.
+nccl_packages=$(dpkg -l 'libnccl*' | awk '/^ii/ {print $2}' || true)
+
+if [[ -z "${nccl_packages}" ]]; then
+  echo "No NCCL packages installed." >&2
+  exit 1
+fi
+
+for nccl_file in $(dpkg -L ${nccl_packages} | sort -u); do
+  # Real files and symlinks are linked into $prefix
+  if [[ -f "${nccl_file}" || -h "${nccl_file}" ]]; then
+    # Replace /usr with $prefix and remove arch-specific lib directories
+    nosysprefix="${nccl_file#"/usr/"}"
+    noarchlib="${nosysprefix/#"lib/${arch}"/lib}"
+    link_name="${prefix}/${noarchlib}"
+    link_dir=$(dirname "${link_name}")
+    mkdir -p "${link_dir}"
+    ln -s "${nccl_file}" "${link_name}"
+  else
+    echo "Skipping ${nccl_file}"
+  fi
+done
diff --git a/.github/container/test-aws-efa.sh b/.github/container/test-aws-efa.sh
deleted file mode 100644
index 21f921a76..000000000
--- a/.github/container/test-aws-efa.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-# 1. Check if AWS EFA installation script completed successfully
-check=$(/opt/amazon/efa/bin/fi_info --version | grep "libfabric")
-if [[ -z "$check" ]]; then
-  echo "Fail to install AWS EFA"
-  exit 1
-fi
-
-echo "AWS EFA installed successfully"
-
-exit 0
\ No newline at end of file