Skip to content

Commit

Permalink
CUDA 12.6 support, pt 2 (#3533)
Browse files Browse the repository at this point in the history
Summary:
X-link: facebookresearch/FBGEMM#615

Pull Request resolved: #3533

Reviewed By: spcyppt

Differential Revision: D67725264

Pulled By: q10

fbshipit-source-id: 3f7206f47781f0d4916a808017743639f8c1e5af
  • Loading branch information
q10 authored and facebook-github-bot committed Jan 1, 2025
1 parent 55b59b7 commit 213d849
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 39 deletions.
5 changes: 5 additions & 0 deletions .github/scripts/fbgemm_gpu_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ __configure_fbgemm_gpu_build_nvcc () {
echo "[BUILD] Setting NVCC flags ..."
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} NVCC_PREPEND_FLAGS=\"${nvcc_prepend_flags}\"
# shellcheck disable=SC2086
print_exec conda run ${env_prefix} printenv NVCC_PREPEND_FLAGS

echo "[BUILD] Setting CUDA build args ..."
# shellcheck disable=SC2206
Expand Down Expand Up @@ -302,6 +304,9 @@ __configure_fbgemm_gpu_build () {
__configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}"
fi

# shellcheck disable=SC2086
print_exec conda run ${env_prefix} c++ --version

# Set other compiler flags as needed
if print_exec "conda run ${env_prefix} c++ --version | grep -i clang"; then
__configure_fbgemm_gpu_build_clang
Expand Down
33 changes: 33 additions & 0 deletions .github/scripts/utils_base.bash
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,36 @@ test_library_symbol () {
return 1
fi
}

################################################################################
# Make the generic compiler names (cc, gcc, c++, g++) inside a Conda
# environment resolve to clang / clang++.
#
# The symlinks are created next to the clang binaries, i.e. in the directory
# reported by `which clang` / `which clang++` when run inside the environment,
# and overwrite any existing entries with the same names.
#
# Arguments:
#   $1 - ENV_NAME: name (or prefix) of the Conda environment, as accepted by
#        env_name_or_prefix
#
# Returns:
#   0 on success; 1 if ENV_NAME is missing, if clang or clang++ cannot be
#   located inside the environment, or if creating any of the symlinks fails
################################################################################
set_clang_symlinks () {
  local env_name="$1"
  if [ "$env_name" == "" ]; then
    echo "Usage: ${FUNCNAME[0]} ENV_NAME"
    echo "Example(s):"
    echo " ${FUNCNAME[0]} build_env"
    return 1
  fi

  # shellcheck disable=SC2155
  local env_prefix=$(env_name_or_prefix "${env_name}")

  # Declare separately from the assignments, so that a failed lookup is not
  # hidden behind the (always successful) exit status of `local`
  local cc_path
  local cxx_path
  # shellcheck disable=SC2086
  cc_path=$(conda run ${env_prefix} which clang)
  # shellcheck disable=SC2086
  cxx_path=$(conda run ${env_prefix} which clang++)

  # Bail out if either compiler could not be located.  With an empty path,
  # `dirname ""` evaluates to `.`, and the `ln` invocations below would end up
  # targeting the current working directory instead of the Conda environment.
  if [ -z "${cc_path}" ] || [ -z "${cxx_path}" ]; then
    echo "[INSTALL] Could not locate clang and/or clang++ in the Conda environment; the compiler symlinks were NOT set"
    return 1
  fi

  local cc_dir
  local cxx_dir
  cc_dir=$(dirname "${cc_path}")
  cxx_dir=$(dirname "${cxx_path}")

  # Set the symlinks, override if needed
  #
  # NOTE: Setting the symlink CONDA_PREFIX/bin/c++ to point to clang++ can mess
  # up the runtime for tests, since torch dynamo makes compilation calls with
  # gcc-specific compiler flags, effectively making gcc a hard dependency:
  #
  #   clang-16: error: unknown argument: '-fno-tree-loop-vectorize'
  #
  # As such, clang is installed only during the build step, where we are
  # exercising building FBGEMM in clang.
  (print_exec ln -sf "${cc_path}" "${cc_dir}/cc") || return 1
  (print_exec ln -sf "${cc_path}" "${cc_dir}/gcc") || return 1
  (print_exec ln -sf "${cxx_path}" "${cxx_dir}/c++") || return 1
  (print_exec ln -sf "${cxx_path}" "${cxx_dir}/g++") || return 1
}
23 changes: 3 additions & 20 deletions .github/scripts/utils_build.bash
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ __remove_gcc_activation_scripts () {
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
echo "[INSTALL] Removing GCC package activation scripts ..."
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
print_exec ls -la ${conda_prefix}/etc/conda/activate.d
print_exec rm -rf ${conda_prefix}/etc/conda/activate.d/activate-gcc_linux-${COMPILER_ARCHNAME}.sh
print_exec rm -rf ${conda_prefix}/etc/conda/activate.d/activate-gxx_linux-${COMPILER_ARCHNAME}.sh
fi
Expand All @@ -208,25 +209,7 @@ __conda_install_clang () {
# The compilers are visible in the PATH as `clang` and `clang++`, so symlinks
# will need to be created
echo "[INSTALL] Setting the C/C++ compiler symlinks ..."
# shellcheck disable=SC2155,SC2086
local cc_path=$(conda run ${env_prefix} which clang)
# shellcheck disable=SC2155,SC2086
local cxx_path=$(conda run ${env_prefix} which clang++)

# Set the symlinks, override if needed
#
# NOTE: Setting the symlink CONDA_PREFIX/bin/c++ to point to clang++ can mess
# up the runtime for tests, since torch dynamo makes compilation calls with
# gcc-specific compiler flags, effectively making gcc a hard dependency:
#
# clang-16: error: unknown argument: '-fno-tree-loop-vectorize'
#
# As such, clang is installed only during the build step, where we are
# exercising building FBGEMM in clang.
print_exec ln -sf "${cc_path}" "$(dirname "$cc_path")/cc"
print_exec ln -sf "${cc_path}" "$(dirname "$cc_path")/gcc"
print_exec ln -sf "${cxx_path}" "$(dirname "$cxx_path")/c++"
print_exec ln -sf "${cxx_path}" "$(dirname "$cxx_path")/g++"
set_clang_symlinks "${env_name}"

# Remove the Conda activation scripts for gcc; see comments in the method for details
__remove_gcc_activation_scripts
Expand Down Expand Up @@ -369,7 +352,7 @@ install_build_tools () {
scikit-build \
wheel) || return 1

echo "[INSTALL] Adding symlink librhash.so.0, which is needed by Cmake ..."
echo "[INSTALL] Adding symlink librhash.so.0, which is needed by CMake ..."
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)
(print_exec ln -s "${conda_prefix}/lib/librhash.so" "${conda_prefix}/lib/librhash.so.0") || return 1
Expand Down
76 changes: 59 additions & 17 deletions .github/scripts/utils_cuda.bash
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,54 @@ __set_cuda_symlinks_envvars () {

echo "[INSTALL] Setting environment variable NVML_LIB_PATH ..."
# shellcheck disable=SC2155
local nvml_lib_path=$(find "${conda_prefix}" -name libnvidia-ml.so | head -n1)
local libnvml_path=$(find "${conda_prefix}" -name libnvidia-ml.so | head -n1)
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} NVML_LIB_PATH="${nvml_lib_path}"
print_exec conda env config vars set ${env_prefix} NVML_LIB_PATH="${libnvml_path}"

if [ "$ADD_LIBCUDA_SYMLINK" == "1" ]; then
echo "[INSTALL] Setting up symlink to libnvidia-ml.so.1"
print_exec ln "${libnvml_path}" -s "${conda_prefix}/lib/libnvidia-ml.so.1"
fi

echo "[INSTALL] Setting environment variable CUDA_INCLUDE_DIRS ..."
# shellcheck disable=SC2086
print_exec conda env config vars set ${env_prefix} CUDA_INCLUDE_DIRS=\""${conda_prefix}/include/:${new_cuda_home}/include/"\"

# Ensure that the CUDA headers are properly installed
(test_filepath "${env_name}" cuda_runtime.h) || return 1
# Ensure that the libraries are properly installed
(test_filepath "${env_name}" libcuda.so) || return 1
(test_filepath "${env_name}" libnvToolsExt.so) || return 1
(test_filepath "${env_name}" libnvidia-ml.so) || return 1

# Ensure that nvcc is properly installed
(test_binpath "${env_name}" nvcc) || return 1
}

__set_nvcc_prepend_flags () {
# shellcheck disable=SC2155,SC2086
local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX)

# If clang is available, but CUDA was installed through conda-forge, the
# cc/c++ symlinks will be reset to gcc/g++, so fix this first
# shellcheck disable=SC2155,SC2086
if conda run ${env_prefix} clang --version; then
echo "[INSTALL] Resetting compiler symlinks to clang ..."
set_clang_symlinks "${env_name}"
fi

# The NVCC activation scripts append `-ccbin=${CXX}` to NVCC_PREPEND_FLAGS,
# which overrides whatever `-ccbin` flag we set manually, so remove this
# unwanted hook
print_exec ls -la "${conda_prefix}/etc/conda/activate.d"
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..."
print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh"
fi

local nvcc_prepend_flags=(
# Allow for the use of newer compilers than what the current CUDA SDK supports
# Allow for the use of newer compilers than what the current CUDA SDK
# supports
-allow-unsupported-compiler
)

Expand Down Expand Up @@ -144,25 +180,31 @@ install_cuda () {

# shellcheck disable=SC2155
local env_prefix=$(env_name_or_prefix "${env_name}")

# Install CUDA packages
echo "[INSTALL] Installing CUDA ${cuda_version} ..."
# shellcheck disable=SC2086
(exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c "nvidia/label/cuda-${cuda_version}" -y \
cuda) || return 1

# NOTE: Currently, CUDA 12.6 cannot be installed using the nvidia/label/cuda-*
# conda channels, because we run into the following error:
#
# LibMambaUnsatisfiableError: Encountered problems while solving:
# - nothing provides __win needed by cuda-12.6.3-0
#
# For now, we only use conda-forge for installing 12.6, but it is likely that
# in the future, we will be using conda-forge for installing all CUDA versions
# (except for versions 11.8 and below, which are only available through
# nvidia/label/cuda-*)
if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then
# shellcheck disable=SC2086
(exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \
cuda=${cuda_version}) || return 1
else
# shellcheck disable=SC2086
(exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c "nvidia/label/cuda-${cuda_version}" -y \
cuda) || return 1
fi

# Set the symlinks and environment variables not covered by conda install
__set_cuda_symlinks_envvars

# Ensure that nvcc is properly installed
(test_binpath "${env_name}" nvcc) || return 1
# Ensure that the CUDA headers are properly installed
(test_filepath "${env_name}" cuda_runtime.h) || return 1
# Ensure that the libraries are properly installed
(test_filepath "${env_name}" libcuda.so) || return 1
(test_filepath "${env_name}" libnvToolsExt.so) || return 1
(test_filepath "${env_name}" libnvidia-ml.so) || return 1

# Set the NVCC prepend flags depending on gcc or clang
__set_nvcc_prepend_flags

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
{ arch: x86, instance: "ubuntu-latest" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
cuda-version: [ "11.8.0", "12.4.1" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
compiler: [ "gcc", "clang" ]

steps:
Expand Down Expand Up @@ -149,7 +149,7 @@ jobs:
{ arch: x86, instance: "ubuntu-latest" },
]
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
cuda-version: [ "11.8.0", "12.4.1" ]
cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ]
compiler: [ "gcc", "clang" ]
needs: build_artifact

Expand Down

0 comments on commit 213d849

Please sign in to comment.