diff --git a/src/backend/cuda/include/yaksuri_cudai.h b/src/backend/cuda/include/yaksuri_cudai.h
index 035af932..ccefb8f0 100644
--- a/src/backend/cuda/include/yaksuri_cudai.h
+++ b/src/backend/cuda/include/yaksuri_cudai.h
@@ -16,6 +16,9 @@
 #define CUDA_P2P_CLIQUES (3)
 
 #define YAKSURI_CUDAI_THREAD_BLOCK_SIZE (256)
+#define YAKSURI_CUDAI_MAX_GRIDSZ_X ((1ULL << 31) - 1)
+#define YAKSURI_CUDAI_MAX_GRIDSZ_Y (65535)
+#define YAKSURI_CUDAI_MAX_GRIDSZ_Z (65535)
 
 /* *INDENT-OFF* */
 #ifdef __cplusplus
diff --git a/src/backend/cuda/pup/yaksuri_cudai_pup.c b/src/backend/cuda/pup/yaksuri_cudai_pup.c
index facda6e9..7c827b34 100644
--- a/src/backend/cuda/pup/yaksuri_cudai_pup.c
+++ b/src/backend/cuda/pup/yaksuri_cudai_pup.c
@@ -69,9 +69,25 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
     YAKSU_ERR_CHECK(rc, fn_fail);
 
     int n_threads = YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
-    int n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
+    uint64_t n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
     n_blocks += ! !(count * cuda_type->num_elements % YAKSURI_CUDAI_THREAD_BLOCK_SIZE);
 
+    int n_blocks_x, n_blocks_y, n_blocks_z;
+    if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X) {
+        n_blocks_x = (int) n_blocks;
+        n_blocks_y = 1;
+        n_blocks_z = 1;
+    } else if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X * YAKSURI_CUDAI_MAX_GRIDSZ_Y) {
+        n_blocks_x = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+        n_blocks_y = YAKSU_CEIL(n_blocks, n_blocks_x);
+        n_blocks_z = 1;
+    } else {
+        int n_blocks_xy = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Z);
+        n_blocks_x = YAKSU_CEIL(n_blocks_xy, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+        n_blocks_y = YAKSU_CEIL(n_blocks_xy, n_blocks_x);
+        n_blocks_z = YAKSU_CEIL(n_blocks, (uintptr_t) n_blocks_x * n_blocks_y);
+    }
+
     if ((inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeManaged) ||
         (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeManaged) ||
         (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeDevice &&
@@ -86,7 +102,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
 
-        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1, target);
+        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
+                        n_blocks_z, target);
     } else if (inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeDevice) {
         target = outattr.device;
         cerr = cudaSetDevice(target);
@@ -98,7 +115,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
 
-        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1, target);
+        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
+                        n_blocks_z, target);
     } else if ((outattr.type == cudaMemoryTypeDevice && inattr.device != outattr.device) ||
                (outattr.type == cudaMemoryTypeHost)) {
         assert(inattr.type == cudaMemoryTypeDevice);
@@ -113,10 +131,11 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
-        cuda_type->pack(inbuf, device_tmpbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                        target);
-        cerr = cudaMemcpyAsync(outbuf, device_tmpbuf, count * type->size, cudaMemcpyDefault,
-                               yaksuri_cudai_global.stream[target]);
+        cuda_type->pack(inbuf, device_tmpbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                        n_blocks_y, n_blocks_z, target);
+        cerr =
+            cudaMemcpyAsync(outbuf, device_tmpbuf, count * type->size, cudaMemcpyDefault,
+                            yaksuri_cudai_global.stream[target]);
         YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
     } else {
         rc = YAKSA_ERR__INTERNAL;
         goto fn_fail;
@@ -188,6 +207,22 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
-    int n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
+    uint64_t n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
     n_blocks += ! !(count * cuda_type->num_elements % YAKSURI_CUDAI_THREAD_BLOCK_SIZE);
 
+    int n_blocks_x, n_blocks_y, n_blocks_z;
+    if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X) {
+        n_blocks_x = (int) n_blocks;
+        n_blocks_y = 1;
+        n_blocks_z = 1;
+    } else if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X * YAKSURI_CUDAI_MAX_GRIDSZ_Y) {
+        n_blocks_x = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+        n_blocks_y = YAKSU_CEIL(n_blocks, n_blocks_x);
+        n_blocks_z = 1;
+    } else {
+        int n_blocks_xy = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Z);
+        n_blocks_x = YAKSU_CEIL(n_blocks_xy, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+        n_blocks_y = YAKSU_CEIL(n_blocks_xy, n_blocks_x);
+        n_blocks_z = YAKSU_CEIL(n_blocks, (uintptr_t) n_blocks_x * n_blocks_y);
+    }
+
     if ((inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeManaged) ||
         (inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeDevice) ||
         (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeDevice &&
@@ -202,8 +237,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
 
-        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                          target);
+        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                          n_blocks_y, n_blocks_z, target);
     } else if (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeManaged) {
         target = inattr.device;
         cerr = cudaSetDevice(target);
@@ -215,8 +250,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
 
-        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                          target);
+        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                          n_blocks_y, n_blocks_z, target);
     } else if ((inattr.type == cudaMemoryTypeDevice && inattr.device != outattr.device) ||
                (inattr.type == cudaMemoryTypeHost)) {
         assert(outattr.type == cudaMemoryTypeDevice);
@@ -235,8 +270,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
                                yaksuri_cudai_global.stream[target]);
         YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
 
-        cuda_type->unpack(device_tmpbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1,
-                          1, target);
+        cuda_type->unpack(device_tmpbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                          n_blocks_y, n_blocks_z, target);
     } else {
         rc = YAKSA_ERR__INTERNAL;
         goto fn_fail;
diff --git a/src/util/yaksu_base.h b/src/util/yaksu_base.h
index cdb22626..6fa0e1bd 100644
--- a/src/util/yaksu_base.h
+++ b/src/util/yaksu_base.h
@@ -8,6 +8,7 @@
 
 #define YAKSU_MAX(x, y) ((x) > (y) ? (x) : (y))
 #define YAKSU_MIN(x, y) ((x) < (y) ? (x) : (y))
+#define YAKSU_CEIL(x, y) (((x) / (y)) + !!((x) % (y)))
 
 #define YAKSU_ERR_CHKANDJUMP(check, rc, errcode, label) \
     do { \
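
Note on the grid decomposition: CUDA caps gridDim.x at 2^31 - 1 and gridDim.y/gridDim.z at 65535, which is what the three YAKSURI_CUDAI_MAX_GRIDSZ_* macros encode. When the block count exceeds the x cap, the patch folds the excess into y, and then into z. The standalone harness below is a minimal sketch of that logic, not part of the patch: it copies the YAKSU_CEIL arithmetic from the diff and asserts the two properties the kernel launch depends on (the grid covers every block, and no dimension exceeds its cap). decompose(), main(), and the sample counts are made up for illustration, and n_blocks_xy is widened to uint64_t here, since ceil(n_blocks / 65535) can itself exceed INT_MAX for very large block counts.

/*
 * Standalone sketch (not part of the patch): mirrors the 3D-grid
 * decomposition above and checks its invariants for a few block
 * counts.  CEIL and the grid-size caps are copied from the diff;
 * everything else is illustrative.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_GRIDSZ_X ((1ULL << 31) - 1)
#define MAX_GRIDSZ_Y (65535)
#define MAX_GRIDSZ_Z (65535)
#define CEIL(x, y) (((x) / (y)) + !!((x) % (y)))

static void decompose(uint64_t n_blocks, int *x, int *y, int *z)
{
    if (n_blocks <= MAX_GRIDSZ_X) {
        /* common case: everything fits in the x dimension */
        *x = (int) n_blocks;
        *y = 1;
        *z = 1;
    } else if (n_blocks <= MAX_GRIDSZ_X * MAX_GRIDSZ_Y) {
        /* fold the overflow into y; x >= n/65535 guarantees y <= 65535 */
        *x = (int) CEIL(n_blocks, MAX_GRIDSZ_Y);
        *y = (int) CEIL(n_blocks, (uint64_t) (*x));
        *z = 1;
    } else {
        /* fold into y and then z */
        uint64_t xy = CEIL(n_blocks, (uint64_t) MAX_GRIDSZ_Z);
        *x = (int) CEIL(xy, MAX_GRIDSZ_Y);
        *y = (int) CEIL(xy, (uint64_t) (*x));
        *z = (int) CEIL(n_blocks, (uint64_t) (*x) * (*y));
    }
}

int main(void)
{
    uint64_t samples[] = {
        1, 256, MAX_GRIDSZ_X, MAX_GRIDSZ_X + 1,
        MAX_GRIDSZ_X * MAX_GRIDSZ_Y, MAX_GRIDSZ_X * MAX_GRIDSZ_Y + 1
    };
    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        int x, y, z;
        decompose(samples[i], &x, &y, &z);
        /* the grid must cover every block without exceeding any cap */
        assert((uint64_t) x * y * z >= samples[i]);
        assert(x >= 1 && (uint64_t) x <= MAX_GRIDSZ_X);
        assert(y >= 1 && y <= MAX_GRIDSZ_Y && z >= 1 && z <= MAX_GRIDSZ_Z);
        printf("%llu blocks -> grid (%d, %d, %d)\n",
               (unsigned long long) samples[i], x, y, z);
    }
    return 0;
}

On the kernel side, a 3D launch like this is conventionally flattened back to a linear element index as ((blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; whether yaksa's generated kernels use exactly that form is not visible in this diff.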