Raise MSVC warning level from /W3 to /W4 #2342

15 changes: 11 additions & 4 deletions cmake/helpers.cmake
@@ -108,18 +108,25 @@ function(add_ur_target_compile_options name)
elseif(MSVC)
target_compile_options(${name} PRIVATE
$<$<CXX_COMPILER_ID:MSVC>:/MP> # clang-cl.exe does not support /MP
/W3
/W4
/wd4456 # Disable: declaration of 'identifier' hides previous local declaration
/wd4457 # Disable: declaration of 'identifier' hides function parameter
/wd4458 # Disable: declaration of 'identifier' hides class member
/wd4459 # Disable: declaration of 'identifier' hides global declaration
/MD$<$<CONFIG:Debug>:d>
/GS
/DWIN32_LEAN_AND_MEAN
/DNOMINMAX
)

if(UR_DEVELOPER_MODE)
target_compile_definitions(${name} PRIVATE
# _CRT_SECURE_NO_WARNINGS used mainly because of getenv
# C4267: The compiler detected a conversion from size_t to a smaller type.
_CRT_SECURE_NO_WARNINGS
)

if(UR_DEVELOPER_MODE)
target_compile_options(${name} PRIVATE
/WX /GS /D_CRT_SECURE_NO_WARNINGS /wd4267
/WX /GS
)
endif()
endif()
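For context on the four new suppressions: at /W4, MSVC reports C4456-C4459 whenever a declaration shadows another one. A hypothetical snippet (not taken from the UR sources) that would trigger each of them:

```cpp
// Hypothetical shadowing patterns; names are illustrative only.
int gTotal = 0; // global declaration

struct Accumulator {
  int total = 0; // class member

  int add(int value) {
    int total = value; // C4458: declaration of 'total' hides class member
    {
      int total = 0;     // C4456: declaration of 'total' hides previous local declaration
      int value = total; // C4457: declaration of 'value' hides function parameter
      (void)value;
    }
    return total;
  }
};

int readGlobal() {
  int gTotal = 1; // C4459: declaration of 'gTotal' hides global declaration
  return gTotal;
}
```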
7 changes: 7 additions & 0 deletions examples/collector/collector.cpp
@@ -25,7 +25,14 @@
#include <string_view>

#include "ur_api.h"

#ifdef _MSC_VER
#pragma warning(disable : 4245)
#endif
#include "xpti/xpti_trace_framework.h"
#ifdef _MSC_VER
#pragma warning(default : 4245)
#endif

constexpr uint16_t TRACE_FN_BEGIN =
static_cast<uint16_t>(xpti::trace_point_type_t::function_with_args_begin);
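The disable/default pair above resets C4245 to its default state after the include. A closely related idiom, sketched here only for illustration and not as part of this change, is push/pop, which restores whatever state the warning had before the header:

```cpp
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4245) // signed/unsigned conversion inside the third-party header
#endif
#include "xpti/xpti_trace_framework.h"
#ifdef _MSC_VER
#pragma warning(pop)
#endif
```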
2 changes: 1 addition & 1 deletion include/ur_api.h
@@ -426,7 +426,7 @@ typedef struct ur_physical_mem_handle_t_ *ur_physical_mem_handle_t;
///////////////////////////////////////////////////////////////////////////////
#ifndef UR_BIT
/// @brief Generic macro for enumerator bit masks
#define UR_BIT(_i) (1 << _i)
#define UR_BIT(_i) (1U << _i)
#endif // UR_BIT

///////////////////////////////////////////////////////////////////////////////
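The yml change below mirrors this macro update. The unsigned literal matters once high bits are used; a minimal illustration (values hypothetical) of why `1U` is the safer form for enumerator bit masks:

```cpp
#include <cstdint>

#define UR_BIT_EXAMPLE(_i) (1U << _i) // unsigned literal, as in this PR

// With a plain 'int' literal, 1 << 31 overflows a signed 32-bit int
// (undefined behaviour before C++20), and converting the resulting negative
// constant to an unsigned flags type draws signed/unsigned warnings such as
// C4245 at /W4. The unsigned form stays well-defined:
uint32_t flags = UR_BIT_EXAMPLE(31); // 0x80000000u
```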
2 changes: 1 addition & 1 deletion scripts/core/common.yml
@@ -134,7 +134,7 @@ name: "$x_physical_mem_handle_t"
type: macro
desc: "Generic macro for enumerator bit masks"
name: "$X_BIT( _i )"
value: "( 1 << _i )"
value: "( 1U << _i )"
--- #--------------------------------------------------------------------------
type: enum
desc: "Defines Return/Error codes"
7 changes: 4 additions & 3 deletions source/adapters/cuda/CMakeLists.txt
@@ -97,15 +97,16 @@ if (UR_ENABLE_TRACING)
get_target_property(XPTI_SRC_DIR xpti SOURCE_DIR)
set(XPTI_PROXY_SRC "${XPTI_SRC_DIR}/xpti_proxy.cpp")
endif()
target_compile_definitions(${TARGET_NAME} PRIVATE
add_library(cuda-xpti-proxy STATIC ${XPTI_PROXY_SRC})
target_compile_definitions(cuda-xpti-proxy PRIVATE
XPTI_ENABLE_INSTRUMENTATION
XPTI_STATIC_LIBRARY
)
target_include_directories(${TARGET_NAME} PRIVATE
target_include_directories(cuda-xpti-proxy PRIVATE
${XPTI_INCLUDES}
${CUDA_CUPTI_INCLUDE_DIR}
)
target_sources(${TARGET_NAME} PRIVATE ${XPTI_PROXY_SRC})
target_link_libraries(${TARGET_NAME} PRIVATE cuda-xpti-proxy)
endif()

if (CUDA_cupti_LIBRARY)
26 changes: 13 additions & 13 deletions source/adapters/cuda/command_buffer.cpp
@@ -242,7 +242,7 @@ static ur_result_t enqueueCommandBufferFillHelper(
if ((PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4)) {
CUDA_MEMSET_NODE_PARAMS NodeParams = {};
NodeParams.dst = DstPtr;
NodeParams.elementSize = PatternSize;
NodeParams.elementSize = static_cast<unsigned int>(PatternSize);
NodeParams.height = N;
NodeParams.pitch = PatternSize;
NodeParams.width = 1;
@@ -508,12 +508,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
auto &ArgIndices = hKernel->getArgIndices();
CUDA_KERNEL_NODE_PARAMS NodeParams = {};
NodeParams.func = CuFunc;
NodeParams.gridDimX = BlocksPerGrid[0];
NodeParams.gridDimY = BlocksPerGrid[1];
NodeParams.gridDimZ = BlocksPerGrid[2];
NodeParams.blockDimX = ThreadsPerBlock[0];
NodeParams.blockDimY = ThreadsPerBlock[1];
NodeParams.blockDimZ = ThreadsPerBlock[2];
NodeParams.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
NodeParams.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
NodeParams.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
NodeParams.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
NodeParams.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
NodeParams.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);
NodeParams.sharedMemBytes = LocalSize;
NodeParams.kernelParams = const_cast<void **>(ArgIndices.data());

@@ -1397,12 +1397,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params;

Params.func = CuFunc;
Params.gridDimX = BlocksPerGrid[0];
Params.gridDimY = BlocksPerGrid[1];
Params.gridDimZ = BlocksPerGrid[2];
Params.blockDimX = ThreadsPerBlock[0];
Params.blockDimY = ThreadsPerBlock[1];
Params.blockDimZ = ThreadsPerBlock[2];
Params.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
Params.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
Params.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
Params.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
Params.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
Params.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);
Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
Params.kernelParams =
const_cast<void **>(KernelCommandHandle->Kernel->getArgIndices().data());
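All of the grid/block and elementSize casts in this file follow the same pattern: CUDA driver-API structs use `unsigned int` fields while UR work sizes are `size_t`, so the implicit 64-bit-to-32-bit narrowing is flagged (C4267/C4244) at /W4. A minimal, hypothetical illustration:

```cpp
#include <cstddef>

struct LaunchDims {
  unsigned int gridDimX; // stand-in for a CUDA_KERNEL_NODE_PARAMS field
};

void fillDims(LaunchDims &dims, std::size_t blocksPerGrid) {
  // dims.gridDimX = blocksPerGrid;                         // C4267 at /W4
  dims.gridDimX = static_cast<unsigned int>(blocksPerGrid); // explicit, warning-free
}
```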
4 changes: 2 additions & 2 deletions source/adapters/cuda/device.cpp
@@ -1153,7 +1153,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform,

try {
if (pNumDevices) {
*pNumDevices = NumDevices;
*pNumDevices = static_cast<uint32_t>(NumDevices);
}

if (ReturnDevices && phDevices) {
@@ -1236,7 +1236,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle(
ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
uint64_t *pDeviceTimestamp,
uint64_t *pHostTimestamp) {
CUevent Event;
CUevent Event{};
ScopedContext Active(hDevice);

if (pDeviceTimestamp) {
45 changes: 25 additions & 20 deletions source/adapters/cuda/enqueue.cpp
@@ -160,7 +160,7 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
int MinGrid, MaxBlockSize;
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
MaxBlockDim[0]));
static_cast<int>(MaxBlockDim[0])));

roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
MaxBlockDim, MaxBlockSize);
@@ -208,7 +208,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
MaxWorkGroupSize = Device->getMaxWorkGroupSize();

if (ProvidedLocalWorkGroupSize) {
auto IsValid = [&](int Dim) {
auto IsValid = [&](size_t Dim) {
if (ReqdThreadsPerBlock[Dim] != 0 &&
LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
@@ -217,7 +217,8 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;

if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim))
if (LocalWorkSize[Dim] >
Device->getMaxWorkItemSizes(static_cast<int>(Dim)))
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
// Checks that local work sizes are a divisor of the global work sizes
// which includes that the local work sizes are neither larger than
@@ -489,9 +490,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(

auto &ArgIndices = hKernel->getArgIndices();
UR_CHECK_ERROR(cuLaunchKernel(
CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize,
CuStream, const_cast<void **>(ArgIndices.data()), nullptr));
CuFunc, static_cast<unsigned int>(BlocksPerGrid[0]),
static_cast<unsigned int>(BlocksPerGrid[1]),
static_cast<unsigned int>(BlocksPerGrid[2]),
static_cast<unsigned int>(ThreadsPerBlock[0]),
static_cast<unsigned int>(ThreadsPerBlock[1]),
static_cast<unsigned int>(ThreadsPerBlock[2]), LocalSize, CuStream,
const_cast<void **>(ArgIndices.data()), nullptr));

if (LocalSize != 0)
hKernel->clearLocalSize();
@@ -657,12 +662,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
auto &ArgIndices = hKernel->getArgIndices();

CUlaunchConfig launch_config;
launch_config.gridDimX = BlocksPerGrid[0];
launch_config.gridDimY = BlocksPerGrid[1];
launch_config.gridDimZ = BlocksPerGrid[2];
launch_config.blockDimX = ThreadsPerBlock[0];
launch_config.blockDimY = ThreadsPerBlock[1];
launch_config.blockDimZ = ThreadsPerBlock[2];
launch_config.gridDimX = static_cast<unsigned int>(BlocksPerGrid[0]);
launch_config.gridDimY = static_cast<unsigned int>(BlocksPerGrid[1]);
launch_config.gridDimZ = static_cast<unsigned int>(BlocksPerGrid[2]);
launch_config.blockDimX = static_cast<unsigned int>(ThreadsPerBlock[0]);
launch_config.blockDimY = static_cast<unsigned int>(ThreadsPerBlock[1]);
launch_config.blockDimZ = static_cast<unsigned int>(ThreadsPerBlock[2]);

launch_config.sharedMemBytes = LocalSize;
launch_config.hStream = CuStream;
@@ -1075,8 +1080,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
break;
}
default: {
UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size,
pPattern, DstDevice));
UR_CHECK_ERROR(
commonMemSetLargePattern(Stream, static_cast<uint32_t>(patternSize),
size, pPattern, DstDevice));
break;
}
}
@@ -1108,7 +1114,6 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) {
return 4;
default:
detail::ur::die("Invalid image format.");
return 0;
}
}

@@ -1212,7 +1217,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array));

int ElementByteSize = imageElementByteSize(ArrayDesc);
int ElementByteSize = static_cast<int>(imageElementByteSize(ArrayDesc));

size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels;
size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width;
@@ -1285,7 +1290,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array));

int ElementByteSize = imageElementByteSize(ArrayDesc);
int ElementByteSize = static_cast<int>(imageElementByteSize(ArrayDesc));

size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels;
size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width;
@@ -1364,7 +1369,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
UR_ASSERT(SrcArrayDesc.NumChannels == DstArrayDesc.NumChannels,
UR_RESULT_ERROR_INVALID_MEM_OBJECT);

int ElementByteSize = imageElementByteSize(SrcArrayDesc);
int ElementByteSize = static_cast<int>(imageElementByteSize(SrcArrayDesc));

size_t DstByteOffsetX =
dstOrigin.x * ElementByteSize * SrcArrayDesc.NumChannels;
@@ -1549,8 +1554,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
CuStream));
break;
default:
commonMemSetLargePattern(CuStream, patternSize, size, pPattern,
(CUdeviceptr)ptr);
commonMemSetLargePattern(CuStream, static_cast<uint32_t>(patternSize),
size, pPattern, (CUdeviceptr)ptr);
break;
}
if (phEvent) {
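The dropped `return 0;` after `detail::ur::die(...)` in imageElementByteSize is the unreachable-code case: assuming the termination helper is (or behaves as) `[[noreturn]]`, any statement after it triggers C4702 at /W4. A hedged sketch of the pattern, with a hypothetical `die`:

```cpp
#include <cstddef>

[[noreturn]] void die(const char *message); // assumption: the helper never returns

std::size_t imageElementByteSizeSketch(int numChannels) {
  switch (numChannels) {
  case 1:
    return 1;
  case 4:
    return 4;
  default:
    die("Invalid image format.");
    // return 0; // unreachable once die() is [[noreturn]] -> C4702 at /W4
  }
}
```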
5 changes: 3 additions & 2 deletions source/adapters/cuda/image.cpp
@@ -284,8 +284,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp(
ur_result_t Result = UR_RESULT_SUCCESS;
try {
ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch,
widthInBytes, height, elementSizeBytes));
UR_CHECK_ERROR(
cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch, widthInBytes,
height, static_cast<unsigned int>(elementSizeBytes)));
} catch (ur_result_t error) {
Result = error;
} catch (...) {
4 changes: 2 additions & 2 deletions source/adapters/cuda/kernel.cpp
@@ -203,8 +203,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(

int MaxNumActiveGroupsPerCU{0};
UR_CHECK_ERROR(cuOccupancyMaxActiveBlocksPerMultiprocessor(
&MaxNumActiveGroupsPerCU, hKernel->get(), localWorkSize,
dynamicSharedMemorySize));
&MaxNumActiveGroupsPerCU, hKernel->get(),
static_cast<int>(localWorkSize), dynamicSharedMemorySize));
detail::ur::assertion(MaxNumActiveGroupsPerCU >= 0);
// Handle the case where we can't have all SMs active with at least 1 group
// per SM. In that case, the device is still able to run 1 work-group, hence
8 changes: 4 additions & 4 deletions source/adapters/cuda/kernel.hpp
@@ -97,8 +97,8 @@ struct ur_kernel_handle_t_ {
}
ParamSizes[Index] = Size;
// calculate the insertion point on the array
size_t InsertPos = std::accumulate(std::begin(ParamSizes),
std::begin(ParamSizes) + Index, 0);
size_t InsertPos = std::accumulate(
std::begin(ParamSizes), std::begin(ParamSizes) + Index, size_t{0});
// Update the stored value for the argument
std::memcpy(&Storage[InsertPos], Arg, Size);
Indices[Index] = &Storage[InsertPos];
@@ -152,8 +152,8 @@ struct ur_kernel_handle_t_ {
const args_index_t &getIndices() const noexcept { return Indices; }

uint32_t getLocalSize() const {
return std::accumulate(std::begin(OffsetPerIndex),
std::end(OffsetPerIndex), 0);
return static_cast<uint32_t>(std::accumulate(
std::begin(OffsetPerIndex), std::end(OffsetPerIndex), size_t{0}));
}
} Args;

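Both accumulate changes in this header hinge on the same detail: `std::accumulate` deduces its result type from the initial value, so an `int` literal `0` sums `size_t` elements through `int`, and converting the result back draws C4267 (and can truncate). A small sketch with hypothetical data:

```cpp
#include <cstddef>
#include <numeric>
#include <vector>

std::size_t totalParamBytes(const std::vector<std::size_t> &paramSizes) {
  // return std::accumulate(paramSizes.begin(), paramSizes.end(), 0);           // accumulates as int
  return std::accumulate(paramSizes.begin(), paramSizes.end(), std::size_t{0}); // accumulates as size_t
}
```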
4 changes: 2 additions & 2 deletions source/adapters/cuda/program.cpp
@@ -148,8 +148,8 @@ ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) {
}

UR_CHECK_ERROR(cuModuleLoadDataEx(&Module, static_cast<const void *>(Binary),
Options.size(), Options.data(),
OptionVals.data()));
static_cast<unsigned int>(Options.size()),
Options.data(), OptionVals.data()));

BuildStatus = UR_PROGRAM_BUILD_STATUS_SUCCESS;

2 changes: 1 addition & 1 deletion source/adapters/cuda/usm.cpp
@@ -325,7 +325,7 @@ umf_result_t USMMemoryProvider::initialize(ur_context_handle_t Ctx,

enum umf_result_t USMMemoryProvider::alloc(size_t Size, size_t Align,
void **Ptr) {
auto Res = allocateImpl(Ptr, Size, Align);
auto Res = allocateImpl(Ptr, Size, static_cast<uint32_t>(Align));
if (Res != UR_RESULT_SUCCESS) {
getLastStatusRef() = Res;
return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC;
18 changes: 10 additions & 8 deletions source/adapters/level_zero/CMakeLists.txt
@@ -89,18 +89,19 @@ if(UR_BUILD_ADAPTER_L0)
endif()

# TODO: fix level_zero adapter conversion warnings
# C4267: The compiler detected a conversion from size_t to a smaller type.
target_compile_options(ur_adapter_level_zero PRIVATE
$<$<CXX_COMPILER_ID:MSVC>:/wd4805 /wd4244>
$<$<CXX_COMPILER_ID:MSVC>:/wd4805 /wd4244 /wd4267>
)

set_target_properties(ur_adapter_level_zero PROPERTIES
VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}"
SOVERSION "${PROJECT_VERSION_MAJOR}"
)

if (WIN32)
# 0x800: Search for the DLL only in the System32 folder
target_link_options(ur_adapter_level_zero PRIVATE LINKER:/DEPENDENTLOADFLAG:0x800)
if(CMAKE_CXX_COMPILER_LINKER_ID MATCHES MSVC)
# 0x800: Search for the DLL only in the System32 folder
target_link_options(ur_adapter_level_zero PRIVATE LINKER:/DEPENDENTLOADFLAG:0x800)
endif()

target_link_libraries(ur_adapter_level_zero PRIVATE
@@ -183,18 +184,19 @@ if(UR_BUILD_ADAPTER_L0_V2)
target_compile_definitions(ur_adapter_level_zero_v2 PUBLIC UR_ADAPTER_LEVEL_ZERO_V2)

# TODO: fix level_zero adapter conversion warnings
# C4267: The compiler detected a conversion from size_t to a smaller type.
target_compile_options(ur_adapter_level_zero_v2 PRIVATE
$<$<CXX_COMPILER_ID:MSVC>:/wd4805 /wd4244>
$<$<CXX_COMPILER_ID:MSVC>:/wd4805 /wd4244 /wd4100 /wd4267>
)

set_target_properties(ur_adapter_level_zero_v2 PROPERTIES
VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}"
SOVERSION "${PROJECT_VERSION_MAJOR}"
)

if (WIN32)
# 0x800: Search for the DLL only in the System32 folder
target_link_options(ur_adapter_level_zero_v2 PUBLIC LINKER:/DEPENDENTLOADFLAG:0x800)
if(CMAKE_CXX_COMPILER_LINKER_ID MATCHES MSVC)
# 0x800: Search for the DLL only in the System32 folder
target_link_options(ur_adapter_level_zero_v2 PUBLIC LINKER:/DEPENDENTLOADFLAG:0x800)
endif()

target_link_libraries(ur_adapter_level_zero_v2 PRIVATE
2 changes: 0 additions & 2 deletions source/adapters/level_zero/adapter.cpp
@@ -514,7 +514,5 @@ ur_result_t urAdapterGetInfo(ur_adapter_handle_t, ur_adapter_info_t PropName,
default:
return UR_RESULT_ERROR_INVALID_ENUMERATION;
}

return UR_RESULT_SUCCESS;
}
} // namespace ur::level_zero
2 changes: 1 addition & 1 deletion source/adapters/level_zero/context.cpp
@@ -759,7 +759,7 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
// queue's map to hold the fence and other associated command
// list information.
auto &QGroup = Queue->getQueueGroup(UseCopyEngine);
uint32_t QueueGroupOrdinal;
uint32_t QueueGroupOrdinal = 0;
auto &ZeCommandQueue = ForcedCmdQueue
? *ForcedCmdQueue
: QGroup.getZeQueue(&QueueGroupOrdinal);
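Initializing `QueueGroupOrdinal` addresses MSVC's potentially-uninitialized-local analysis (C4701 at /W4): the variable is filled through an out-parameter on only one branch of the conditional that follows. A hypothetical reduction of the pattern:

```cpp
#include <cstdint>

void getZeQueue(uint32_t *ordinal); // assumed helper that writes *ordinal

uint32_t pickOrdinal(bool forcedQueue) {
  uint32_t ordinal = 0; // initialized up front, as in the diff
  if (!forcedQueue) {
    getZeQueue(&ordinal); // only this branch assigns through the pointer
  }
  return ordinal; // without '= 0', /W4 can report C4701 here
}
```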