Add a USER_CUDA_ARCH_LIST cache option. When it is non-empty its value is
copied into CMAKE_CUDA_ARCHITECTURES; otherwise CMAKE_CUDA_ARCHITECTURES stays
OFF, the value cmake/generic.cmake used before this option existed.

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e1ca9b..f732927 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ endif()
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake")
 
+set(USER_CUDA_ARCH_LIST
+    ""
+    CACHE STRING "User-specified cuda device architectures")
+
 include(generic)
 
 add_subdirectory(csrc)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 7236d5b..ffe0c7a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -39,14 +39,26 @@ find_package(Torch REQUIRED)
 message(STATUS "Torch include include_directories: " ${TORCH_INCLUDE_DIRS})
 include_directories(${TORCH_INCLUDE_DIRS})
 
-# let cmake automatically detect the current CUDA architecture to avoid
-# generating device codes for all possible architectures
-set(CMAKE_CUDA_ARCHITECTURES OFF)
+if(USER_CUDA_ARCH_LIST)
+  message(STATUS "User specified CUDA architectures: ${USER_CUDA_ARCH_LIST}")
+  set(CMAKE_CUDA_ARCHITECTURES ${USER_CUDA_ARCH_LIST})
+else()
+  # No explicit list: keep OFF, the value this file used before the option
+  # existed. ON is not a valid CMAKE_CUDA_ARCHITECTURES value.
+  message(STATUS "No user specified CUDA architectures, cmake will detect the "
+                 "current CUDA architecture.")
+  set(CMAKE_CUDA_ARCHITECTURES OFF)
+endif()
+
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror all-warnings")
+
 # Set the CUDA_PROPAGATE_HOST_FLAGS to OFF to avoid passing host compiler flags
 # to the device compiler
 set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
+message(STATUS "my cuda architecture list: ${CMAKE_CUDA_ARCHITECTURES}")
+message(STATUS "cmake cuda flags: ${CMAKE_CUDA_FLAGS}")
+
 # FIXME(ying): -std=c++17 has to be set explicitly here, Otherwise, linking
 # against torchlibs will raise errors. it seems that the host compilation
 # options are not passed to torchlibs.
@@ -64,6 +76,8 @@ set(CUDA_NVCC_FLAGS
     -U__CUDA_NO_BFLOAT162_CONVERSIONS__)
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)
 
+message(STATUS "NVCC FLAGS = ${CUDA_NVCC_FLAGS}")
+
 if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
   add_definitions("-DENABLE_BF16")
   message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} "
diff --git a/setup.py b/setup.py
index dadfe13..7d39dcd 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
-
 import os
 import subprocess
 from pathlib import Path
@@ -56,6 +55,10 @@ def build_extension(self, ext: CMakeExtension) -> None:
         ) if self.debug is None else self.debug
         cfg = "Debug" if debug else "Release"
 
+        # Forward TORCH_CUDA_ARCH_LIST to the USER_CUDA_ARCH_LIST CMake cache
+        # entry. NOTE(review): value is passed verbatim; confirm its format.
+        arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+
         parallel_level = os.environ.get("CMAKE_BUILD_PARALLEL_LEVEL", None)
         if parallel_level is not None:
             self.parallel = int(parallel_level)
@@ -72,9 +75,11 @@ def build_extension(self, ext: CMakeExtension) -> None:
             "-DCMAKE_BUILD_TYPE=%s" % cfg,
             "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(
                 cfg.upper(), extdir
-            ), "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format(
+            ),
+            "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format(
                 cfg.upper(), self.build_temp
-            )
+            ),
+            *(["-DUSER_CUDA_ARCH_LIST=" + arch_list] if arch_list else []),
         ]
 
         # Adding CMake arguments set as environment variable