Avoid the need for 64-bit SYCL IDs (#333)

joeatodd · web-flow · commit 9cc9df22fd90 · 2025-04-28T19:15:33.000+01:00
Clamp the grid size computed for `BlockForEach` in SYCL builds so that the
global range never exceeds the int32 maximum. Remove the
`-fno-sycl-id-queries-fit-in-int` DPC++ flag, which is no longer needed.
diff --git a/cmake/FindDPCPP.cmake b/cmake/FindDPCPP.cmake
@@ -38,7 +38,7 @@ find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
 
 add_library(DPCPP::DPCPP INTERFACE IMPORTED)
 
-set(DPCPP_FLAGS "-fsycl;-fno-sycl-id-queries-fit-in-int;")
+set(DPCPP_FLAGS "-fsycl;")
 set(DPCPP_COMPILE_ONLY_FLAGS "")
 
 if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "")
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
@@ -30,6 +30,7 @@
  **************************************************************************************************/
 #pragma once
 
+#include <limits>
 #include <stdexcept>
 #include "cutlass/cutlass.h"
 #include "cutlass/util/reference/device/kernel/tensor_foreach.h"
@@ -133,7 +134,8 @@ struct BlockForEach {
 #if defined (CUTLASS_ENABLE_SYCL)
       // TODO: query the queue for block size
       block_size = 128;
-      grid_size = cute::ceil_div(capacity, block_size);
+      // Ensure global range doesn't overflow int
+      grid_size = std::min(capacity, static_cast<size_t>(std::numeric_limits<int>::max())) / block_size;
 #else
       // if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
       cudaError_t result = cudaOccupancyMaxPotentialBlockSize(