From 213d849a4987ba4edb028a3ef4a4acfa469bc55c Mon Sep 17 00:00:00 2001 From: Benson Ma Date: Wed, 1 Jan 2025 00:12:24 -0800 Subject: [PATCH] CUDA 12.6 support, pt 2 (#3533) Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/615 Pull Request resolved: https://github.com/pytorch/FBGEMM/pull/3533 Reviewed By: spcyppt Differential Revision: D67725264 Pulled By: q10 fbshipit-source-id: 3f7206f47781f0d4916a808017743639f8c1e5af --- .github/scripts/fbgemm_gpu_build.bash | 5 ++ .github/scripts/utils_base.bash | 33 ++++++++ .github/scripts/utils_build.bash | 23 +----- .github/scripts/utils_cuda.bash | 76 ++++++++++++++----- .../fbgemm_gpu_ci_genai_generic_infra.yml | 4 +- 5 files changed, 102 insertions(+), 39 deletions(-) diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash index 2ba89eef50..5d81ab999c 100644 --- a/.github/scripts/fbgemm_gpu_build.bash +++ b/.github/scripts/fbgemm_gpu_build.bash @@ -106,6 +106,8 @@ __configure_fbgemm_gpu_build_nvcc () { echo "[BUILD] Setting NVCC flags ..." # shellcheck disable=SC2086 print_exec conda env config vars set ${env_prefix} NVCC_PREPEND_FLAGS=\"${nvcc_prepend_flags}\" + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} printenv NVCC_PREPEND_FLAGS echo "[BUILD] Setting CUDA build args ..." 
# shellcheck disable=SC2206 @@ -302,6 +304,9 @@ __configure_fbgemm_gpu_build () { __configure_fbgemm_gpu_build_cuda "${fbgemm_variant_targets}" fi + # shellcheck disable=SC2086 + print_exec conda run ${env_prefix} c++ --version + # Set other compiler flags as needed if print_exec "conda run ${env_prefix} c++ --version | grep -i clang"; then __configure_fbgemm_gpu_build_clang diff --git a/.github/scripts/utils_base.bash b/.github/scripts/utils_base.bash index 39a88f7a69..976befa4e2 100644 --- a/.github/scripts/utils_base.bash +++ b/.github/scripts/utils_base.bash @@ -265,3 +265,36 @@ test_library_symbol () { return 1 fi } + +set_clang_symlinks () { + local env_name="$1" + if [ "$env_name" == "" ]; then + echo "Usage: ${FUNCNAME[0]} ENV_NAME" + echo "Example(s):" + echo " ${FUNCNAME[0]} build_env" + return 1 + fi + + # shellcheck disable=SC2155 + local env_prefix=$(env_name_or_prefix "${env_name}") + + # shellcheck disable=SC2155,SC2086 + local cc_path=$(conda run ${env_prefix} which clang) + # shellcheck disable=SC2155,SC2086 + local cxx_path=$(conda run ${env_prefix} which clang++) + + # Set the symlinks, override if needed + # + # NOTE: Setting the symlink CONDA_PREFIX/bin/c++ to point to clang++ can mess + # up the runtime for tests, since torch dynamo makes compilation calls with + # gcc-specific compiler flags, effectively making gcc a hard dependency: + # + # clang-16: error: unknown argument: '-fno-tree-loop-vectorize' + # + # As such, clang is installed only during the build step, where we are + # exercising building FBGEMM in clang. 
+ print_exec ln -sf "${cc_path}" "$(dirname "$cc_path")/cc" + print_exec ln -sf "${cc_path}" "$(dirname "$cc_path")/gcc" + print_exec ln -sf "${cxx_path}" "$(dirname "$cxx_path")/c++" + print_exec ln -sf "${cxx_path}" "$(dirname "$cxx_path")/g++" +} diff --git a/.github/scripts/utils_build.bash b/.github/scripts/utils_build.bash index 7fded1eb2e..9fc88e449e 100644 --- a/.github/scripts/utils_build.bash +++ b/.github/scripts/utils_build.bash @@ -182,6 +182,7 @@ __remove_gcc_activation_scripts () { if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then echo "[INSTALL] Removing GCC package activation scripts ..." local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) + print_exec ls -la ${conda_prefix}/etc/conda/activate.d print_exec rm -rf ${conda_prefix}/etc/conda/activate.d/activate-gcc_linux-${COMPILER_ARCHNAME}.sh print_exec rm -rf ${conda_prefix}/etc/conda/activate.d/activate-gxx_linux-${COMPILER_ARCHNAME}.sh fi @@ -208,25 +209,7 @@ __conda_install_clang () { # The compilers are visible in the PATH as `clang` and `clang++`, so symlinks # will need to be created echo "[INSTALL] Setting the C/C++ compiler symlinks ..." - # shellcheck disable=SC2155,SC2086 - local cc_path=$(conda run ${env_prefix} which clang) - # shellcheck disable=SC2155,SC2086 - local cxx_path=$(conda run ${env_prefix} which clang++) - - # Set the symlinks, override if needed - # - # NOTE: Setting the symlink CONDA_PREFIX/bin/c++ to point to clang++ can mess - # up the runtime for tests, since torch dynamo makes compilation calls with - # gcc-specific compiler flags, effectively making gcc a hard dependency: - # - # clang-16: error: unknown argument: '-fno-tree-loop-vectorize' - # - # As such, clang is installed only during the build step, where we are - # exercising building FBGEMM in clang. 
- print_exec ln -sf "${cc_path}" "$(dirname "$cc_path")/cc" - print_exec ln -sf "${cc_path}" "$(dirname "$cc_path")/gcc" - print_exec ln -sf "${cxx_path}" "$(dirname "$cxx_path")/c++" - print_exec ln -sf "${cxx_path}" "$(dirname "$cxx_path")/g++" + set_clang_symlinks "${env_name}" # Remove the Conda activations scripts for gcc; see comments in the method for details __remove_gcc_activation_scripts @@ -369,7 +352,7 @@ install_build_tools () { scikit-build \ wheel) || return 1 - echo "[INSTALL] Adding symlink librhash.so.0, which is needed by Cmake ..." + echo "[INSTALL] Adding symlink librhash.so.0, which is needed by CMake ..." # shellcheck disable=SC2155,SC2086 local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) (print_exec ln -s "${conda_prefix}/lib/librhash.so" "${conda_prefix}/lib/librhash.so.0") || return 1 diff --git a/.github/scripts/utils_cuda.bash b/.github/scripts/utils_cuda.bash index 81bd6e3332..40f906aadb 100644 --- a/.github/scripts/utils_cuda.bash +++ b/.github/scripts/utils_cuda.bash @@ -49,18 +49,54 @@ __set_cuda_symlinks_envvars () { echo "[INSTALL] Setting environment variable NVML_LIB_PATH ..." # shellcheck disable=SC2155 - local nvml_lib_path=$(find "${conda_prefix}" -name libnvidia-ml.so | head -n1) + local libnvml_path=$(find "${conda_prefix}" -name libnvidia-ml.so | head -n1) # shellcheck disable=SC2086 - print_exec conda env config vars set ${env_prefix} NVML_LIB_PATH="${nvml_lib_path}" + print_exec conda env config vars set ${env_prefix} NVML_LIB_PATH="${libnvml_path}" + + if [ "$ADD_LIBCUDA_SYMLINK" == "1" ]; then + echo "[INSTALL] Setting up symlink to libnvidia-ml.so.1" + print_exec ln "${libnvml_path}" -s "${conda_prefix}/lib/libnvidia-ml.so.1" + fi echo "[INSTALL] Setting environment variable CUDA_INCLUDE_DIRS ..." 
# shellcheck disable=SC2086 print_exec conda env config vars set ${env_prefix} CUDA_INCLUDE_DIRS=\""${conda_prefix}/include/:${new_cuda_home}/include/"\" + + # Ensure that the CUDA headers are properly installed + (test_filepath "${env_name}" cuda_runtime.h) || return 1 + # Ensure that the libraries are properly installed + (test_filepath "${env_name}" libcuda.so) || return 1 + (test_filepath "${env_name}" libnvToolsExt.so) || return 1 + (test_filepath "${env_name}" libnvidia-ml.so) || return 1 + + # Ensure that nvcc is properly installed + (test_binpath "${env_name}" nvcc) || return 1 } __set_nvcc_prepend_flags () { + # shellcheck disable=SC2155,SC2086 + local conda_prefix=$(conda run ${env_prefix} printenv CONDA_PREFIX) + + # If clang is available, but CUDA was installed through conda-forge, the + # cc/c++ symlinks will be reset to gcc/g++, so fix this first + # shellcheck disable=SC2155,SC2086 + if conda run ${env_prefix} clang --version; then + echo "[INSTALL] Resetting compiler symlinks to clang ..." + set_clang_symlinks "${env_name}" + fi + + # The NVCC activation scripts append `-ccbin=${CXX}` to NVCC_PREPEND_FLAGS, + # which overrides whatever `-ccbin` flag we set manually, so remove this + # unwanted hook + print_exec ls -la "${conda_prefix}/etc/conda/activate.d" + if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then + echo "[INSTALL] Removing the -ccbin=CXX hook from NVCC activation scripts ..." + print_exec sed -i '/-ccbin=/d' "${conda_prefix}/etc/conda/activate.d/*cuda-nvcc_activate.sh" + fi + local nvcc_prepend_flags=( - # Allow for the use of newer compilers than what the current CUDA SDK supports + # Allow for the use of newer compilers than what the current CUDA SDK + # supports -allow-unsupported-compiler ) @@ -144,25 +180,31 @@ install_cuda () { # shellcheck disable=SC2155 local env_prefix=$(env_name_or_prefix "${env_name}") - - # Install CUDA packages echo "[INSTALL] Installing CUDA ${cuda_version} ..." 
- # shellcheck disable=SC2086 - (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c "nvidia/label/cuda-${cuda_version}" -y \ - cuda) || return 1 + + # NOTE: Currently, CUDA 12.6 cannot be installed using the nvidia/label/cuda-* + # conda channels, because we run into the following error: + # + # LibMambaUnsatisfiableError: Encountered problems while solving: + # - nothing provides __win needed by cuda-12.6.3-0 + # + # For now, we only use conda-forge for installing 12.6, but it is likely that + # in the future, we will be using conda-forge for installing all CUDA versions + # (except for versions 11.8 and below, which are only available through + # nvidia/label/cuda-*) + if [[ "$BUILD_CUDA_VERSION" =~ ^12.6.*$ ]]; then + # shellcheck disable=SC2086 + (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c conda-forge --override-channels -y \ + cuda=${cuda_version}) || return 1 + else + # shellcheck disable=SC2086 + (exec_with_retries 3 conda install --force-reinstall ${env_prefix} -c "nvidia/label/cuda-${cuda_version}" -y \ + cuda) || return 1 + fi # Set the symlinks and environment variables not covered by conda install __set_cuda_symlinks_envvars - # Ensure that nvcc is properly installed - (test_binpath "${env_name}" nvcc) || return 1 - # Ensure that the CUDA headers are properly installed - (test_filepath "${env_name}" cuda_runtime.h) || return 1 - # Ensure that the libraries are properly installed - (test_filepath "${env_name}" libcuda.so) || return 1 - (test_filepath "${env_name}" libnvToolsExt.so) || return 1 - (test_filepath "${env_name}" libnvidia-ml.so) || return 1 - # Set the NVCC prepend flags depending on gcc or clang __set_nvcc_prepend_flags diff --git a/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml index d23b53c72d..d028d23474 100644 --- a/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml +++ 
b/.github/workflows/fbgemm_gpu_ci_genai_generic_infra.yml @@ -61,7 +61,7 @@ jobs: { arch: x86, instance: "ubuntu-latest" }, ] python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] - cuda-version: [ "11.8.0", "12.4.1" ] + cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ] compiler: [ "gcc", "clang" ] steps: @@ -149,7 +149,7 @@ jobs: { arch: x86, instance: "ubuntu-latest" }, ] python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] - cuda-version: [ "11.8.0", "12.4.1" ] + cuda-version: [ "11.8.0", "12.4.1", "12.6.3" ] compiler: [ "gcc", "clang" ] needs: build_artifact