diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e1ca9b..f732927 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,6 +13,10 @@ endif()
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake")
 
+# Optional list of CUDA architectures; consumed by cmake/generic.cmake.
+set(USER_CUDA_ARCH_LIST "" CACHE STRING
+    "User-specified cuda device architectures")
+
 include(generic)
 
 add_subdirectory(csrc)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 7236d5b..ffe0c7a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -39,14 +39,26 @@ find_package(Torch REQUIRED)
 message(STATUS "Torch include include_directories: " ${TORCH_INCLUDE_DIRS})
 include_directories(${TORCH_INCLUDE_DIRS})
 
-# let cmake automatically detect the current CUDA architecture to avoid
-# generating device codes for all possible architectures
-set(CMAKE_CUDA_ARCHITECTURES OFF)
+if(USER_CUDA_ARCH_LIST)
+  message(STATUS "User specified CUDA architectures: ${USER_CUDA_ARCH_LIST}")
+  set(CMAKE_CUDA_ARCHITECTURES "${USER_CUDA_ARCH_LIST}")
+else()
+  # Use OFF as before: CMake then passes no architecture flags of its own.
+  # "ON" is not a valid architecture value for CMAKE_CUDA_ARCHITECTURES.
+  message(STATUS "No user specified CUDA architectures, cmake will detect the "
+                 "current CUDA architecture.")
+  set(CMAKE_CUDA_ARCHITECTURES OFF)
+endif()
+
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}  --Werror all-warnings")
+
 # Set the CUDA_PROPAGATE_HOST_FLAGS to OFF to avoid passing host compiler flags
 # to the device compiler
 set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
+message(STATUS "cuda architecture list: ${CMAKE_CUDA_ARCHITECTURES}")
+message(STATUS "cmake cuda flags: ${CMAKE_CUDA_FLAGS}")
+
 # FIXME(ying): -std=c++17 has to be set explicitly here, Otherwise, linking
 # against torchlibs will raise errors. it seems that the host compilation
 # options are not passed to torchlibs.
@@ -64,6 +76,8 @@ set(CUDA_NVCC_FLAGS
     -U__CUDA_NO_BFLOAT162_CONVERSIONS__)
 set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --use_fast_math)
 
+message(STATUS "NVCC FLAGS = ${CUDA_NVCC_FLAGS}") # nvcc flags assembled above
+
 if(${CUDA_VERSION_MAJOR} VERSION_GREATER_EQUAL "11")
   add_definitions("-DENABLE_BF16")
   message("CUDA_VERSION ${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} "
diff --git a/setup.py b/setup.py
index dadfe13..7d39dcd 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,6 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # --------------------------------------------------------------------------
-
 import os
 import subprocess
 from pathlib import Path
@@ -56,6 +55,10 @@ def build_extension(self, ext: CMakeExtension) -> None:
         ) if self.debug is None else self.debug
         cfg = "Debug" if debug else "Release"
 
+        # Read TORCH_CUDA_ARCH_LIST; it is forwarded to cmake below.
+        # NOTE(review): confirm its format (e.g. "8.0;8.6") suits cmake.
+        arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+
         parallel_level = os.environ.get("CMAKE_BUILD_PARALLEL_LEVEL", None)
         if parallel_level is not None:
             self.parallel = int(parallel_level)
@@ -72,9 +75,11 @@ def build_extension(self, ext: CMakeExtension) -> None:
                 "-DCMAKE_BUILD_TYPE=%s" % cfg,
                 "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}".format(
                     cfg.upper(), extdir
-                ), "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format(
+                ),
+                "-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY_{}={}".format(
                     cfg.upper(), self.build_temp
-                )
+                ),
+                *(["-DUSER_CUDA_ARCH_LIST=" + arch_list] if arch_list else []),
             ]
 
             # Adding CMake arguments set as environment variable