diff --git a/src/backend/cuda/include/yaksuri_cudai.h b/src/backend/cuda/include/yaksuri_cudai.h
index 035af932..ccefb8f0 100644
--- a/src/backend/cuda/include/yaksuri_cudai.h
+++ b/src/backend/cuda/include/yaksuri_cudai.h
@@ -16,6 +16,9 @@
 #define CUDA_P2P_CLIQUES (3)
 
 #define YAKSURI_CUDAI_THREAD_BLOCK_SIZE (256)
+#define YAKSURI_CUDAI_MAX_GRIDSZ_X ((1ULL << 31) - 1)
+#define YAKSURI_CUDAI_MAX_GRIDSZ_Y (65535)
+#define YAKSURI_CUDAI_MAX_GRIDSZ_Z (65535)
 
 /* *INDENT-OFF* */
 #ifdef __cplusplus
diff --git a/src/backend/cuda/pup/yaksuri_cudai_pup.c b/src/backend/cuda/pup/yaksuri_cudai_pup.c
index facda6e9..7c827b34 100644
--- a/src/backend/cuda/pup/yaksuri_cudai_pup.c
+++ b/src/backend/cuda/pup/yaksuri_cudai_pup.c
@@ -69,9 +69,25 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
     YAKSU_ERR_CHECK(rc, fn_fail);
 
     int n_threads = YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
-    int n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
+    uint64_t n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
     n_blocks += ! !(count * cuda_type->num_elements % YAKSURI_CUDAI_THREAD_BLOCK_SIZE);
 
+    int n_blocks_x, n_blocks_y, n_blocks_z;
+    if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X) {
+        n_blocks_x = (int) n_blocks;
+        n_blocks_y = 1;
+        n_blocks_z = 1;
+    } else if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X * YAKSURI_CUDAI_MAX_GRIDSZ_Y) {
+        n_blocks_x = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+        n_blocks_y = YAKSU_CEIL(n_blocks, n_blocks_x);
+        n_blocks_z = 1;
+    } else {
+        int n_blocks_xy = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Z);
+        n_blocks_x = YAKSU_CEIL(n_blocks_xy, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+        n_blocks_y = YAKSU_CEIL(n_blocks_xy, n_blocks_x);
+        n_blocks_z = YAKSU_CEIL(n_blocks, (uintptr_t) n_blocks_x * n_blocks_y);
+    }
+
     if ((inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeManaged) ||
         (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeManaged) ||
         (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeDevice &&
@@ -86,7 +102,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
 
-        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1, target);
+        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
+                        n_blocks_z, target);
     } else if (inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeDevice) {
         target = outattr.device;
         cerr = cudaSetDevice(target);
@@ -98,7 +115,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
 
-        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1, target);
+        cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
+                        n_blocks_z, target);
     } else if ((outattr.type == cudaMemoryTypeDevice && inattr.device != outattr.device) ||
                (outattr.type == cudaMemoryTypeHost)) {
         assert(inattr.type == cudaMemoryTypeDevice);
@@ -113,10 +131,11 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
-        cuda_type->pack(inbuf, device_tmpbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                        target);
-        cerr = cudaMemcpyAsync(outbuf, device_tmpbuf, count * type->size, cudaMemcpyDefault,
-                               yaksuri_cudai_global.stream[target]);
+        cuda_type->pack(inbuf, device_tmpbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                        n_blocks_y, n_blocks_z, target);
+        cerr =
+            cudaMemcpyAsync(outbuf, device_tmpbuf, count * type->size, cudaMemcpyDefault,
+                            yaksuri_cudai_global.stream[target]);
         YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
     } else {
         rc = YAKSA_ERR__INTERNAL;
         goto fn_fail;
@@ -188,6 +207,22 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
-    int n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
+    uint64_t n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
     n_blocks += ! !(count * cuda_type->num_elements % YAKSURI_CUDAI_THREAD_BLOCK_SIZE);
 
+    int n_blocks_x, n_blocks_y, n_blocks_z;
+    if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X) {
+        n_blocks_x = (int) n_blocks;
+        n_blocks_y = 1;
+        n_blocks_z = 1;
+    } else if (n_blocks <= YAKSURI_CUDAI_MAX_GRIDSZ_X * YAKSURI_CUDAI_MAX_GRIDSZ_Y) {
+        n_blocks_x = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+        n_blocks_y = YAKSU_CEIL(n_blocks, n_blocks_x);
+        n_blocks_z = 1;
+    } else {
+        int n_blocks_xy = YAKSU_CEIL(n_blocks, YAKSURI_CUDAI_MAX_GRIDSZ_Z);
+        n_blocks_x = YAKSU_CEIL(n_blocks_xy, YAKSURI_CUDAI_MAX_GRIDSZ_Y);
+        n_blocks_y = YAKSU_CEIL(n_blocks_xy, n_blocks_x);
+        n_blocks_z = YAKSU_CEIL(n_blocks, (uintptr_t) n_blocks_x * n_blocks_y);
+    }
+
     if ((inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeManaged) ||
         (inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeDevice) ||
         (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeDevice &&
@@ -202,8 +237,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
 
-        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                          target);
+        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                          n_blocks_y, n_blocks_z, target);
     } else if (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeManaged) {
         target = inattr.device;
         cerr = cudaSetDevice(target);
@@ -215,8 +250,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
         }
 
-        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1, 1,
-                          target);
+        cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                          n_blocks_y, n_blocks_z, target);
     } else if ((inattr.type == cudaMemoryTypeDevice && inattr.device != outattr.device) ||
                (inattr.type == cudaMemoryTypeHost)) {
         assert(outattr.type == cudaMemoryTypeDevice);
@@ -235,8 +270,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
                                yaksuri_cudai_global.stream[target]);
         YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
 
-        cuda_type->unpack(device_tmpbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, 1,
-                          1, target);
+        cuda_type->unpack(device_tmpbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                          n_blocks_y, n_blocks_z, target);
     } else {
         rc = YAKSA_ERR__INTERNAL;
         goto fn_fail;
diff --git a/src/util/yaksu_base.h b/src/util/yaksu_base.h
index cdb22626..6fa0e1bd 100644
--- a/src/util/yaksu_base.h
+++ b/src/util/yaksu_base.h
@@ -8,6 +8,7 @@
 
 #define YAKSU_MAX(x, y) ((x) > (y) ? (x) : (y))
 #define YAKSU_MIN(x, y) ((x) < (y) ? (x) : (y))
+#define YAKSU_CEIL(x, y) (((x) / (y)) + !!((x) % (y)))
 
 #define YAKSU_ERR_CHKANDJUMP(check, rc, errcode, label) \
     do { \
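
Note on the grid decomposition: CUDA caps gridDim.x at 2^31 - 1 and gridDim.y/gridDim.z at 65535, which is what the three YAKSURI_CUDAI_MAX_GRIDSZ_* macros encode. When the block count exceeds the x cap, the patch folds the excess into y, and then into z. The standalone harness below is a minimal sketch of that logic, not part of the patch: it copies the YAKSU_CEIL arithmetic from the diff and asserts the two properties the kernel launch depends on (the grid covers every block, and no dimension exceeds its cap). decompose(), main(), and the sample counts are made up for illustration, and n_blocks_xy is widened to uint64_t here, since ceil(n_blocks / 65535) can itself exceed INT_MAX for very large block counts.

/*
 * Standalone sketch (not part of the patch): mirrors the 3D-grid
 * decomposition above and checks its invariants for a few block
 * counts.  CEIL and the grid-size caps are copied from the diff;
 * everything else is illustrative.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_GRIDSZ_X ((1ULL << 31) - 1)
#define MAX_GRIDSZ_Y (65535)
#define MAX_GRIDSZ_Z (65535)
#define CEIL(x, y) (((x) / (y)) + !!((x) % (y)))

static void decompose(uint64_t n_blocks, int *x, int *y, int *z)
{
    if (n_blocks <= MAX_GRIDSZ_X) {
        /* common case: everything fits in the x dimension */
        *x = (int) n_blocks;
        *y = 1;
        *z = 1;
    } else if (n_blocks <= MAX_GRIDSZ_X * MAX_GRIDSZ_Y) {
        /* fold the overflow into y; x >= n/65535 guarantees y <= 65535 */
        *x = (int) CEIL(n_blocks, MAX_GRIDSZ_Y);
        *y = (int) CEIL(n_blocks, (uint64_t) (*x));
        *z = 1;
    } else {
        /* fold into y and then z */
        uint64_t xy = CEIL(n_blocks, (uint64_t) MAX_GRIDSZ_Z);
        *x = (int) CEIL(xy, MAX_GRIDSZ_Y);
        *y = (int) CEIL(xy, (uint64_t) (*x));
        *z = (int) CEIL(n_blocks, (uint64_t) (*x) * (*y));
    }
}

int main(void)
{
    uint64_t samples[] = {
        1, 256, MAX_GRIDSZ_X, MAX_GRIDSZ_X + 1,
        MAX_GRIDSZ_X * MAX_GRIDSZ_Y, MAX_GRIDSZ_X * MAX_GRIDSZ_Y + 1
    };
    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        int x, y, z;
        decompose(samples[i], &x, &y, &z);
        /* the grid must cover every block without exceeding any cap */
        assert((uint64_t) x * y * z >= samples[i]);
        assert(x >= 1 && (uint64_t) x <= MAX_GRIDSZ_X);
        assert(y >= 1 && y <= MAX_GRIDSZ_Y && z >= 1 && z <= MAX_GRIDSZ_Z);
        printf("%llu blocks -> grid (%d, %d, %d)\n",
               (unsigned long long) samples[i], x, y, z);
    }
    return 0;
}

On the kernel side, a 3D launch like this is conventionally flattened back to a linear element index as ((blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; whether yaksa's generated kernels use exactly that form is not visible in this diff.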