
Commit 32e1b4e

Add --target-cuda argument for selecting CUDA architecture (#2478)
This PR adds a `--target-cuda` argument to `scripts/build_locally.py`, allowing users to enable CUDA support and optionally specify the target architecture (e.g. `sm_80`). If no architecture is specified, `sm_50` is used by default.

```bash
$ python scripts/build_locally.py --target-cuda
# or
$ python scripts/build_locally.py --target-cuda=<arch>
```
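As a rough illustration of the behaviour described above (not code from this PR), the value of `--target-cuda`, or a bare `ON` when the flag is passed without a value, ends up forwarded to CMake as a `-DDPNP_TARGET_CUDA=<value>` define. The helper below is a hypothetical sketch of that forwarding step; the real wiring lives in `scripts/build_locally.py` (see the diff further down).

```python
# Hypothetical sketch only; mirrors the forwarding described in the PR summary.
def cuda_cmake_arg(target_cuda):
    """Translate the parsed --target-cuda value into a CMake define."""
    if target_cuda is None:
        # Flag not given at all: CUDA support stays disabled.
        return []
    if not target_cuda.strip():
        raise ValueError("--target-cuda can not be an empty string")
    # Bare --target-cuda parses as "ON" (default sm_50); --target-cuda=sm_80 passes through.
    return [f"-DDPNP_TARGET_CUDA={target_cuda}"]

print(cuda_cmake_arg("ON"))     # ['-DDPNP_TARGET_CUDA=ON']
print(cuda_cmake_arg("sm_80"))  # ['-DDPNP_TARGET_CUDA=sm_80']
```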
1 parent e9e84fc commit 32e1b4e

File tree

4 files changed: +76, -25 lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+* Added `--target-cuda[=ARCH]` option to replace the deprecated `--target=cuda`, allowing users to build for CUDA devices with optional architecture selection using [CodePlay oneAPI plug-in](https://developer.codeplay.com/products/oneapi/nvidia/home/) [#2478](https://github.com/IntelPython/dpnp/pull/2478)
+
 ### Changed
 
 * Adjusted the `pre-commit` configuration to run autoupdate weekly [#2479](https://github.com/IntelPython/dpnp/pull/2479)

CMakeLists.txt

Lines changed: 21 additions & 7 deletions
@@ -68,14 +68,17 @@ find_package(Dpctl REQUIRED)
 message(STATUS "Dpctl_INCLUDE_DIR=" ${Dpctl_INCLUDE_DIR})
 message(STATUS "Dpctl_TENSOR_INCLUDE_DIR=" ${Dpctl_TENSOR_INCLUDE_DIR})
 
-option(DPNP_TARGET_CUDA
-    "Build DPNP to target CUDA devices"
-    OFF
-)
 option(DPNP_USE_ONEMKL_INTERFACES
     "Build DPNP with oneMKL Interfaces"
     OFF
 )
+set(DPNP_TARGET_CUDA
+    ""
+    CACHE STRING
+    "Build DPNP to target CUDA device. \
+    Set to a truthy value (e.g., ON, TRUE) to use default architecture (sm_50), \
+    or to a specific architecture like sm_80."
+)
 set(HIP_TARGETS "" CACHE STRING "HIP architecture for target")
 
 set(_dpnp_sycl_targets)
@@ -87,8 +90,19 @@ set(_dpnp_sycl_target_compile_options)
 set(_dpnp_sycl_target_link_options)
 
 if ("x${DPNP_SYCL_TARGETS}" STREQUAL "x")
-    if(DPNP_TARGET_CUDA)
-        set(_dpnp_sycl_targets "nvptx64-nvidia-cuda,spir64-unknown-unknown")
+    if (DPNP_TARGET_CUDA)
+        set(_dpnp_cuda_arch)
+        if(DPNP_TARGET_CUDA MATCHES "^sm_")
+            set(_dpnp_cuda_arch ${DPNP_TARGET_CUDA})
+        elseif(DPNP_TARGET_CUDA MATCHES "^(ON|TRUE|YES|Y|1)$")
+            set(_dpnp_cuda_arch "sm_50")
+        else()
+            message(FATAL_ERROR
+                "Invalid value for DPNP_TARGET_CUDA: \"${DPNP_TARGET_CUDA}\". "
+                "Expected 'ON', 'TRUE', 'YES', 'Y', '1', or a CUDA architecture like 'sm_80'."
+            )
+        endif()
+        set(_dpnp_sycl_targets "nvidia_gpu_${_dpnp_cuda_arch},spir64-unknown-unknown")
         set(_use_onemkl_interfaces_cuda ON)
     endif()
 
@@ -104,7 +118,7 @@ if ("x${DPNP_SYCL_TARGETS}" STREQUAL "x")
 else()
     set(_dpnp_sycl_targets ${DPNP_SYCL_TARGETS})
 
-    if ("${DPNP_SYCL_TARGETS}" MATCHES "nvptx64-nvidia-cuda")
+    if("${DPNP_SYCL_TARGETS}" MATCHES "(nvidia_gpu_sm_|nvptx64-nvidia-cuda)")
        set(_use_onemkl_interfaces_cuda ON)
     endif()
 
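To make the new CMake branch easier to follow, here is a small Python rendering of the same value handling. It is only an illustration of the logic in the diff above; the `sycl_targets_for` helper is invented for this sketch and is not part of the build.

```python
import re

def sycl_targets_for(target_cuda: str) -> str:
    """Mirror of the DPNP_TARGET_CUDA handling shown in the CMake diff above."""
    if re.match(r"^sm_", target_cuda):
        arch = target_cuda                  # explicit architecture, e.g. sm_80
    elif re.match(r"^(ON|TRUE|YES|Y|1)$", target_cuda):
        arch = "sm_50"                      # truthy value falls back to the default
    else:
        raise ValueError(f'Invalid value for DPNP_TARGET_CUDA: "{target_cuda}"')
    return f"nvidia_gpu_{arch},spir64-unknown-unknown"

assert sycl_targets_for("ON") == "nvidia_gpu_sm_50,spir64-unknown-unknown"
assert sycl_targets_for("sm_80") == "nvidia_gpu_sm_80,spir64-unknown-unknown"
```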

doc/quick_start_guide.rst

Lines changed: 37 additions & 6 deletions
@@ -144,13 +144,40 @@ installation layout of compatible version. The following plugins from CodePlay a
 Building ``dpnp`` also requires `building Data Parallel Control Library for custom SYCL targets.
 <https://intelpython.github.io/dpctl/latest/beginners_guides/installation.html#building-for-custom-sycl-targets>`_
 
-``dpnp`` can be built for CUDA devices as follows:
+Builds for CUDA and AMD devices internally use SYCL alias targets that are passed to the compiler.
+A full list of available SYCL alias targets is available in the
+`DPC++ Compiler User Manual <https://intel.github.io/llvm/UsersManual.html>`_.
+
+CUDA build
+~~~~~~~~~~
+
+To build for CUDA devices, use the ``--target-cuda`` argument.
+
+To target a specific architecture (e.g., ``sm_80``):
+
+.. code-block:: bash
+
+    python scripts/build_locally.py --target-cuda=sm_80
+
+To use the default architecture (``sm_50``), run:
 
 .. code-block:: bash
 
-    python scripts/build_locally.py --target=cuda
+    python scripts/build_locally.py --target-cuda
+
+Note that kernels are built for the default architecture (``sm_50``), allowing them to work on a
+wider range of architectures, but limiting the usage of more recent CUDA features.
+
+For reference, compute architecture strings like ``sm_80`` correspond to specific
+CUDA Compute Capabilities (e.g., Compute Capability 8.0 corresponds to ``sm_80``).
+A complete mapping between NVIDIA GPU models and their respective
+Compute Capabilities can be found in the official
+`CUDA GPU Compute Capability <https://developer.nvidia.com/cuda-gpus>`_ documentation.
+
+AMD build
+~~~~~~~~~
 
-And for AMD devices:
+To build for AMD devices, use the ``--target-hip=<arch>`` argument:
 
 .. code-block:: bash
 
@@ -173,13 +200,17 @@ For example:
 .. code-block:: bash
 
     python scripts/build_locally.py --target-hip=gfx90a
 
+Multi-target build
+~~~~~~~~~~~~~~~~~~
 
-It is, however, possible to build for Intel devices, CUDA devices, and an AMD device
-architecture all at once:
+The default ``dpnp`` build from the source enables support of Intel devices only.
+Extending the build with a custom SYCL target additionally enables support of CUDA or AMD
+device in ``dpnp``. Besides, the support can be also extended to enable both CUDA and AMD
+devices at the same time:
 
 .. code-block:: bash
 
-    python scripts/build_locally.py --target=cuda --target-hip=gfx90a
+    python scripts/build_locally.py --target-cuda --target-hip=gfx90a
 
 
 Testing
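As an optional sanity check after a ``--target-cuda`` build (my own suggestion, not something the documentation in this diff prescribes), the CUDA device should show up in dpctl's device list when the CodePlay plug-in is installed:

```python
# Assumes dpctl is installed alongside the freshly built dpnp.
import dpctl

for dev in dpctl.get_devices():
    print(dev.backend, dev.name)
# A device reported with the "cuda" backend indicates the NVIDIA GPU is visible to SYCL.
```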

scripts/build_locally.py

Lines changed: 16 additions & 12 deletions
@@ -38,7 +38,7 @@ def run(
     cmake_executable=None,
     verbose=False,
     cmake_opts="",
-    target="intel",
+    target_cuda=None,
     target_hip=None,
     onemkl_interfaces=False,
     onemkl_interfaces_dir=None,
@@ -98,12 +98,14 @@ def run(
     if "DPL_ROOT" in os.environ:
         os.environ["DPL_ROOT_HINT"] = os.environ["DPL_ROOT"]
 
-    if not target.strip():
-        target = "intel"
-
-    if target == "cuda":
+    if target_cuda is not None:
+        if not target_cuda.strip():
+            raise ValueError(
+                "--target-cuda can not be an empty string. "
+                "Use --target-cuda=<arch> or --target-cuda"
+            )
         cmake_args += [
-            "-DDPNP_TARGET_CUDA=ON",
+            f"-DDPNP_TARGET_CUDA={target_cuda}",
         ]
         # Always builds using oneMKL interfaces for the cuda target
         onemkl_interfaces = True
@@ -129,7 +131,7 @@ def run(
             f"-DDPNP_ONEMKL_INTERFACES_DIR={onemkl_interfaces_dir}",
         ]
     elif onemkl_interfaces_dir:
-        RuntimeError("--onemkl-interfaces-dir option is not supported")
+        raise RuntimeError("--onemkl-interfaces-dir option is not supported")
 
     subprocess.check_call(
         cmake_args, shell=False, cwd=setup_dir, env=os.environ
@@ -186,10 +188,12 @@ def run(
         type=str,
     )
     driver.add_argument(
-        "--target",
-        help="Target backend for build",
-        dest="target",
-        default="intel",
+        "--target-cuda",
+        nargs="?",
+        const="ON",
+        help="Enable CUDA target for build; "
+        "optionally specify architecture (e.g., --target-cuda=sm_80)",
+        default=None,
         type=str,
     )
     driver.add_argument(
@@ -265,7 +269,7 @@ def run(
         cmake_executable=args.cmake_executable,
         verbose=args.verbose,
         cmake_opts=args.cmake_opts,
-        target=args.target,
+        target_cuda=args.target_cuda,
         target_hip=args.target_hip,
         onemkl_interfaces=args.onemkl_interfaces,
         onemkl_interfaces_dir=args.onemkl_interfaces_dir,
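The `nargs="?"` / `const="ON"` combination in the diff above is what lets `--target-cuda` act both as a plain flag and as a value carrier. A minimal, standalone argparse sketch of that behaviour (the parser and prints here are illustrative only, not taken from the script):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--target-cuda",
    nargs="?",
    const="ON",    # value used when the flag is given without an argument
    default=None,  # value used when the flag is absent
    type=str,
)

print(parser.parse_args([]).target_cuda)                       # None  -> CUDA disabled
print(parser.parse_args(["--target-cuda"]).target_cuda)        # ON    -> default sm_50
print(parser.parse_args(["--target-cuda=sm_80"]).target_cuda)  # sm_80 -> explicit architecture
```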
