Skip to content

Commit 9cc9df2

Browse files
authored
Avoid need for 64-bit sycl IDs (#333)
Change the range calculation for `BlockForEach` to ensure the SYCL range does not overflow a 32-bit int. Revert the DPC++ flag `-fno-sycl-id-queries-fit-in-int`, which is no longer needed.
1 parent d736d1c commit 9cc9df2

File tree

2 files changed

+4
-2
lines changed

2 files changed

+4
-2
lines changed

cmake/FindDPCPP.cmake

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ find_library(DPCPP_LIB_DIR NAMES sycl sycl6 PATHS "${DPCPP_BIN_DIR}/../lib")
3838

3939
add_library(DPCPP::DPCPP INTERFACE IMPORTED)
4040

41-
set(DPCPP_FLAGS "-fsycl;-fno-sycl-id-queries-fit-in-int;")
41+
set(DPCPP_FLAGS "-fsycl;")
4242
set(DPCPP_COMPILE_ONLY_FLAGS "")
4343

4444
if(NOT "${DPCPP_SYCL_TARGET}" STREQUAL "")

tools/util/include/cutlass/util/reference/device/tensor_foreach.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
**************************************************************************************************/
3131
#pragma once
3232

33+
#include <limits>
3334
#include <stdexcept>
3435
#include "cutlass/cutlass.h"
3536
#include "cutlass/util/reference/device/kernel/tensor_foreach.h"
@@ -133,7 +134,8 @@ struct BlockForEach {
133134
#if defined (CUTLASS_ENABLE_SYCL)
134135
// TODO: query the queue for block size
135136
block_size = 128;
136-
grid_size = cute::ceil_div(capacity, block_size);
137+
// Ensure global range doesn't overflow int
138+
grid_size = std::min(capacity, static_cast<size_t>(std::numeric_limits<int>::max())) / block_size;
137139
#else
138140
// if grid_size or block_size are zero, query occupancy using the CUDA Occupancy API
139141
cudaError_t result = cudaOccupancyMaxPotentialBlockSize(

0 commit comments

Comments
 (0)