backend/cuda: use all three dimensions of the block grid
This allows us to handle much larger pack/unpack sizes, and should be
sufficient for the foreseeable future.

Fixes #17

Signed-off-by: Pavan Balaji <[email protected]>
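
For scale: with YAKSURI_CUDAI_THREAD_BLOCK_SIZE (256) threads per block, a 1-D grid tops out at (2^31 - 1) blocks, roughly 5.5 x 10^11 elements per launch; spreading blocks across the y and z dimensions as well multiplies the block budget by 65535^2. A standalone sketch of that arithmetic (editorial illustration, not part of the commit; all names are local to the sketch):

    #include <stdio.h>

    int main(void)
    {
        /* grid limits for compute capability >= 3.0 devices */
        const double max_x = 2147483647.0;   /* (1ULL << 31) - 1 */
        const double max_y = 65535.0;
        const double max_z = 65535.0;
        const double threads = 256.0;        /* YAKSURI_CUDAI_THREAD_BLOCK_SIZE */

        /* doubles sidestep 64-bit overflow; these are order-of-magnitude figures */
        printf("1-D grid: %.3e elements per launch\n", max_x * threads);
        printf("3-D grid: %.3e elements per launch\n", max_x * max_y * max_z * threads);
        return 0;
    }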
pavanbalaji committed Apr 13, 2020
1 parent 2128a88 commit 71ac217
Showing 3 changed files with 52 additions and 13 deletions.
3 changes: 3 additions & 0 deletions src/backend/cuda/include/yaksuri_cudai.h
@@ -16,6 +16,9 @@
#define CUDA_P2P_CLIQUES (3)

#define YAKSURI_CUDAI_THREAD_BLOCK_SIZE (256)
+ #define YAKSURI_CUDAI_MAX_GRIDSZ_X ((1ULL << 31) - 1)
+ #define YAKSURI_CUDAI_MAX_GRIDSZ_Y (65535)
+ #define YAKSURI_CUDAI_MAX_GRIDSZ_Z (65535)

/* *INDENT-OFF* */
#ifdef __cplusplus
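The three limits above mirror the grid dimensions CUDA guarantees on compute capability 3.0 and newer: gridDim.x up to 2^31 - 1, gridDim.y and gridDim.z up to 65535 each. A quick runtime cross-check of the hardcoded values (a sketch, not part of the commit):

    #include <stdio.h>
    #include <cuda_runtime.h>

    int main(void)
    {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
            fprintf(stderr, "no CUDA device visible\n");
            return 1;
        }
        /* expect 2147483647 65535 65535 on compute capability >= 3.0 */
        printf("maxGridSize = %d %d %d\n",
               prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        return 0;
    }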
61 changes: 48 additions & 13 deletions src/backend/cuda/pup/yaksuri_cudai_pup.c
@@ -69,9 +69,25 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
YAKSU_ERR_CHECK(rc, fn_fail);

int n_threads = YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
- int n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
+ uint64_t n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
n_blocks += ! !(count * cuda_type->num_elements % YAKSURI_CUDAI_THREAD_BLOCK_SIZE);

+ int n_blocks_x, n_blocks_y, n_blocks_z;
+ if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X) {
+     n_blocks_x = (int) n_blocks;
+     n_blocks_y = 1;
+     n_blocks_z = 1;
+ } else if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X * YAKSURI_CUDAI_MAX_GRIDSZ_Y) {
+     n_blocks_x = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+     n_blocks_y = YAKSU_CEIL(n_blocks, n_blocks_x);
+     n_blocks_z = 1;
+ } else {
+     int n_blocks_xy = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Z);
+     n_blocks_x = YAKSU_CEIL(n_blocks_xy, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+     n_blocks_y = YAKSU_CEIL(n_blocks_xy, n_blocks_x);
+     n_blocks_z = YAKSU_CEIL(n_blocks, (uintptr_t) n_blocks_x * n_blocks_y);
+ }

if ((inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeManaged) ||
(inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeManaged) ||
(inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeDevice &&
@@ -86,7 +102,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
}

- cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1, target);
+ cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
+                 n_blocks_z, target);
} else if (inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeDevice) {
target = outattr.device;
cerr = cudaSetDevice(target);
@@ -98,7 +115,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
}

- cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1, target);
+ cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
+                 n_blocks_z, target);
} else if ((outattr.type == cudaMemoryTypeDevice && inattr.device != outattr.device) ||
(outattr.type == cudaMemoryTypeHost)) {
assert(inattr.type == cudaMemoryTypeDevice);
@@ -113,10 +131,11 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
}

- cuda_type->pack(inbuf, device_tmpbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                 target);
- cerr = cudaMemcpyAsync(outbuf, device_tmpbuf, count * type->size, cudaMemcpyDefault,
-                        yaksuri_cudai_global.stream[target]);
+ cuda_type->pack(inbuf, device_tmpbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                 n_blocks_y, n_blocks_z, target);
+ cerr =
+     cudaMemcpyAsync(outbuf, device_tmpbuf, count * type->size, cudaMemcpyDefault,
+                     yaksuri_cudai_global.stream[target]);
YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
} else {
rc = YAKSA_ERR__INTERNAL;
@@ -188,6 +207,22 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
int n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
n_blocks += ! !(count * cuda_type->num_elements % YAKSURI_CUDAI_THREAD_BLOCK_SIZE);

+ int n_blocks_x, n_blocks_y, n_blocks_z;
+ if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X) {
+     n_blocks_x = (int) n_blocks;
+     n_blocks_y = 1;
+     n_blocks_z = 1;
+ } else if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X * YAKSURI_CUDAI_MAX_GRIDSZ_Y) {
+     n_blocks_x = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+     n_blocks_y = YAKSU_CEIL(n_blocks, n_blocks_x);
+     n_blocks_z = 1;
+ } else {
+     int n_blocks_xy = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Z);
+     n_blocks_x = YAKSU_CEIL(n_blocks_xy, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+     n_blocks_y = YAKSU_CEIL(n_blocks_xy, n_blocks_x);
+     n_blocks_z = YAKSU_CEIL(n_blocks, (uintptr_t) n_blocks_x * n_blocks_y);
+ }

if ((inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeManaged) ||
(inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeDevice) ||
(inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeDevice &&
@@ -202,8 +237,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
}

- cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                   target);
+ cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                   n_blocks_y, n_blocks_z, target);
} else if (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeManaged) {
target = inattr.device;
cerr = cudaSetDevice(target);
@@ -215,8 +250,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
}

- cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                   target);
+ cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                   n_blocks_y, n_blocks_z, target);
} else if ((inattr.type == cudaMemoryTypeDevice && inattr.device != outattr.device) ||
(inattr.type == cudaMemoryTypeHost)) {
assert(outattr.type == cudaMemoryTypeDevice);
@@ -235,8 +270,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
yaksuri_cudai_global.stream[target]);
YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);

- cuda_type->unpack(device_tmpbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1,
-                   1, target);
+ cuda_type->unpack(device_tmpbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                   n_blocks_y, n_blocks_z, target);
} else {
rc = YAKSA_ERR__INTERNAL;
goto fn_fail;
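The decomposition (identical in ipack and iunpack above) only guarantees n_blocks_x * n_blocks_y * n_blocks_z >= n_blocks, so the generated pack/unpack kernels, which this commit does not touch, must flatten blockIdx back into a linear id and bounds-check the overshoot. A sketch of that flattening, assuming blocks are linearized x-fastest (pack_sketch and its byte-copy body are hypothetical placeholders):

    __global__ void pack_sketch(const char *inbuf, char *outbuf, size_t n_bytes)
    {
        /* recover the linear block id from the 3-D launch grid */
        size_t block_id = (size_t) blockIdx.x +
                          (size_t) blockIdx.y * gridDim.x +
                          (size_t) blockIdx.z * gridDim.x * gridDim.y;
        size_t i = block_id * blockDim.x + threadIdx.x;

        if (i < n_bytes)            /* the grid may overshoot; guard the tail */
            outbuf[i] = inbuf[i];   /* stand-in for the real datatype-driven copy */
    }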
1 change: 1 addition & 0 deletions src/util/yaksu_base.h
@@ -8,6 +8,7 @@

#define YAKSU_MAX(x, y) ((x) > (y) ? (x) : (y))
#define YAKSU_MIN(x, y) ((x) < (y) ? (x) : (y))
+ #define YAKSU_CEIL(x, y) (((x) / (y)) + !!((x) % (y)))

#define YAKSU_ERR_CHKANDJUMP(check, rc, errcode, label) \
do { \
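YAKSU_CEIL(x, y) is integer ceiling division: x / y truncates, and !! collapses any nonzero remainder to a single extra unit. For example, YAKSU_CEIL(10, 4) = 2 + 1 = 3 while YAKSU_CEIL(12, 4) = 3 + 0 = 3. A tiny self-check (a sketch, not part of the commit):

    #include <assert.h>

    #define YAKSU_CEIL(x, y) (((x) / (y)) + !!((x) % (y)))

    int main(void)
    {
        assert(YAKSU_CEIL(10, 4) == 3);     /* remainder forces one extra block */
        assert(YAKSU_CEIL(12, 4) == 3);     /* exact division adds nothing */
        assert(YAKSU_CEIL(1, 65535) == 1);  /* never rounds a nonzero count to zero */
        return 0;
    }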
