pmodels · pavanbalaji · Apr 13, 2020 · Apr 13, 2020 · Apr 13, 2020 · gcongiu
diff --git a/src/backend/cuda/genpup.py b/src/backend/cuda/genpup.py
@@ -227,11 +227,11 @@ def generate_kernels(b, darray):
             display("}\n\n")
 
             # generate the host function
-            OUTFILE.write("void yaksuri_cudai_%s(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks, int device)\n" % funcprefix)
+            OUTFILE.write("void yaksuri_cudai_%s(const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s *md, int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device)\n" % funcprefix)
             OUTFILE.write("{\n")
             OUTFILE.write("    void *args[] = { &inbuf, &outbuf, &count, &md };\n")
             OUTFILE.write("    cudaError_t cerr = cudaLaunchKernel((const void *) yaksuri_cudai_kernel_%s,\n" % funcprefix)
-            OUTFILE.write("                dim3(n_blocks), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);\n")
+            OUTFILE.write("                dim3(n_blocks_x, n_blocks_y, n_blocks_z), dim3(n_threads), args, 0, yaksuri_cudai_global.stream[device]);\n")
             OUTFILE.write("    YAKSURI_CUDAI_CUDA_ERR_CHECK(cerr);\n")
             OUTFILE.write("}\n\n")
 
@@ -427,7 +427,9 @@ def switcher(typelist, pupstr, nests):
                 OUTFILE.write("uintptr_t count, ")
                 OUTFILE.write("yaksuri_cudai_md_s *md, ")
                 OUTFILE.write("int n_threads, ")
-                OUTFILE.write("int n_blocks, ")
+                OUTFILE.write("int n_blocks_x, ")
+                OUTFILE.write("int n_blocks_y, ")
+                OUTFILE.write("int n_blocks_z, ")
                 OUTFILE.write("int device);\n")
 
     OUTFILE.write("\n")

diff --git a/src/backend/cuda/include/yaksuri_cudai.h b/src/backend/cuda/include/yaksuri_cudai.h
@@ -15,8 +15,6 @@
 #define CUDA_P2P_DISABLED (2)
 #define CUDA_P2P_CLIQUES  (3)
 
-#define YAKSURI_CUDAI_THREAD_BLOCK_SIZE  (256)
-
 /* *INDENT-OFF* */
 #ifdef __cplusplus
 extern "C" {
@@ -85,9 +83,9 @@ typedef struct yaksuri_cudai_md_s {
 
 typedef struct yaksuri_cudai_type_s {
     void (*pack) (const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s * md,
-                  int n_threads, int n_blocks, int device);
+                  int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device);
     void (*unpack) (const void *inbuf, void *outbuf, uintptr_t count, yaksuri_cudai_md_s * md,
-                    int n_threads, int n_blocks, int device);
+                    int n_threads, int n_blocks_x, int n_blocks_y, int n_blocks_z, int device);
     yaksuri_cudai_md_s *md;
     pthread_mutex_t mdmutex;
     uintptr_t num_elements;

diff --git a/src/backend/cuda/pup/yaksuri_cudai_pup.c b/src/backend/cuda/pup/yaksuri_cudai_pup.c
@@ -9,6 +9,42 @@
 #include "yaksi.h"
 #include "yaksuri_cudai.h"
 
+#define THREAD_BLOCK_SIZE  (256)  /* threads per block reported by get_thread_block_dims() */
+#define MAX_GRIDSZ_X       ((1ULL << 31) - 1)  /* cap on grid x-dimension; presumably CUDA's limit */
+#define MAX_GRIDSZ_Y       (65535)  /* cap on grid y-dimension; presumably CUDA's limit */
+#define MAX_GRIDSZ_Z       (65535)  /* cap on grid z-dimension; presumably CUDA's limit */
+
+/* Pick the launch shape: THREAD_BLOCK_SIZE threads per block and enough blocks to cover
+ * count * num_elements, folded into up to three grid dimensions capped by MAX_GRIDSZ_{X,Y,Z}. */
+static int get_thread_block_dims(uint64_t count, yaksi_type_s * type, int *n_threads,
+                                 int *n_blocks_x, int *n_blocks_y, int *n_blocks_z)
+{
+    yaksuri_cudai_type_s *cuda_type = (yaksuri_cudai_type_s *) type->backend.cuda.priv;
+
+    *n_threads = THREAD_BLOCK_SIZE;
+    uint64_t n_elems = count * cuda_type->num_elements;
+    uint64_t n_blocks = YAKSU_CEIL(n_elems, THREAD_BLOCK_SIZE);
+
+    if (n_blocks <= MAX_GRIDSZ_X) {
+        *n_blocks_x = (int) n_blocks;
+        *n_blocks_y = 1;
+        *n_blocks_z = 1;
+    } else if (n_blocks <= MAX_GRIDSZ_X * MAX_GRIDSZ_Y) {
+        /* NOTE(review): x * y can exceed n_blocks; confirm the kernel ignores surplus blocks */
+        *n_blocks_x = (int) YAKSU_CEIL(n_blocks, MAX_GRIDSZ_Y);
+        *n_blocks_y = (int) YAKSU_CEIL(n_blocks, (uint64_t) (*n_blocks_x));
+        *n_blocks_z = 1;
+    } else {
+        /* the x-y plane size always exceeds INT_MAX in this branch, so keep it in 64 bits */
+        uint64_t n_blocks_xy = YAKSU_CEIL(n_blocks, MAX_GRIDSZ_Z);
+        *n_blocks_x = (int) YAKSU_CEIL(n_blocks_xy, MAX_GRIDSZ_Y);
+        *n_blocks_y = (int) YAKSU_CEIL(n_blocks_xy, (uint64_t) (*n_blocks_x));
+        *n_blocks_z = (int) YAKSU_CEIL(n_blocks, (uint64_t) (*n_blocks_x) * (*n_blocks_y));
+    }
+
+    return YAKSA_SUCCESS;
+}
+
 int yaksuri_cudai_pup_is_supported(yaksi_type_s * type, bool * is_supported)
 {
     int rc = YAKSA_SUCCESS;
@@ -68,9 +104,10 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
         rc = yaksuri_cudai_md_alloc(type);
         YAKSU_ERR_CHECK(rc, fn_fail);
 
-        int n_threads = YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
-        int n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
-        n_blocks += ! !(count * cuda_type->num_elements % YAKSURI_CUDAI_THREAD_BLOCK_SIZE);
+        int n_threads;
+        int n_blocks_x, n_blocks_y, n_blocks_z;
+        rc = get_thread_block_dims(count, type, &n_threads, &n_blocks_x, &n_blocks_y, &n_blocks_z);
+        YAKSU_ERR_CHECK(rc, fn_fail);
 
         if ((inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeManaged) ||
             (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeManaged) ||
@@ -86,7 +123,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
                 YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
             }
 
-            cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, target);
+            cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
+                            n_blocks_z, target);
         } else if (inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeDevice) {
             target = outattr.device;
             cerr = cudaSetDevice(target);
@@ -98,7 +136,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
                 YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
             }
 
-            cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, target);
+            cuda_type->pack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x, n_blocks_y,
+                            n_blocks_z, target);
         } else if ((outattr.type == cudaMemoryTypeDevice && inattr.device != outattr.device) ||
                    (outattr.type == cudaMemoryTypeHost)) {
             assert(inattr.type == cudaMemoryTypeDevice);
@@ -113,8 +152,8 @@ int yaksuri_cudai_ipack(const void *inbuf, void *outbuf, uintptr_t count, yaksi_
                 YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
             }
 
-            cuda_type->pack(inbuf, device_tmpbuf, count, cuda_type->md, n_threads, n_blocks,
-                            target);
+            cuda_type->pack(inbuf, device_tmpbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                            n_blocks_y, n_blocks_z, target);
             cerr = cudaMemcpyAsync(outbuf, device_tmpbuf, count * type->size, cudaMemcpyDefault,
                                    yaksuri_cudai_global.stream[target]);
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
@@ -184,9 +223,10 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
         rc = yaksuri_cudai_md_alloc(type);
         YAKSU_ERR_CHECK(rc, fn_fail);
 
-        int n_threads = YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
-        int n_blocks = count * cuda_type->num_elements / YAKSURI_CUDAI_THREAD_BLOCK_SIZE;
-        n_blocks += ! !(count * cuda_type->num_elements % YAKSURI_CUDAI_THREAD_BLOCK_SIZE);
+        int n_threads;
+        int n_blocks_x, n_blocks_y, n_blocks_z;
+        rc = get_thread_block_dims(count, type, &n_threads, &n_blocks_x, &n_blocks_y, &n_blocks_z);
+        YAKSU_ERR_CHECK(rc, fn_fail);
 
         if ((inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeManaged) ||
             (inattr.type == cudaMemoryTypeManaged && outattr.type == cudaMemoryTypeDevice) ||
@@ -202,7 +242,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
                 YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
             }
 
-            cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, target);
+            cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                              n_blocks_y, n_blocks_z, target);
         } else if (inattr.type == cudaMemoryTypeDevice && outattr.type == cudaMemoryTypeManaged) {
             target = inattr.device;
             cerr = cudaSetDevice(target);
@@ -214,7 +255,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
                 YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
             }
 
-            cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks, target);
+            cuda_type->unpack(inbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                              n_blocks_y, n_blocks_z, target);
         } else if ((inattr.type == cudaMemoryTypeDevice && inattr.device != outattr.device) ||
                    (inattr.type == cudaMemoryTypeHost)) {
             assert(outattr.type == cudaMemoryTypeDevice);
@@ -233,8 +275,8 @@ int yaksuri_cudai_iunpack(const void *inbuf, void *outbuf, uintptr_t count, yaks
                                    yaksuri_cudai_global.stream[target]);
             YAKSURI_CUDAI_CUDA_ERR_CHKANDJUMP(cerr, rc, fn_fail);
 
-            cuda_type->unpack(device_tmpbuf, outbuf, count, cuda_type->md, n_threads, n_blocks,
-                              target);
+            cuda_type->unpack(device_tmpbuf, outbuf, count, cuda_type->md, n_threads, n_blocks_x,
+                              n_blocks_y, n_blocks_z, target);
         } else {
             rc = YAKSA_ERR__INTERNAL;
             goto fn_fail;

diff --git a/src/util/yaksu_base.h b/src/util/yaksu_base.h
@@ -8,6 +8,7 @@
 
 #define YAKSU_MAX(x, y)  ((x) > (y) ? (x) : (y))
 #define YAKSU_MIN(x, y)  ((x) < (y) ? (x) : (y))
+#define YAKSU_CEIL(x, y) (((x) / (y)) + !!((x) % (y)))  /* x / y rounded up (x >= 0, y > 0); evaluates both arguments twice */
 
 #define YAKSU_ERR_CHKANDJUMP(check, rc, errcode, label) \
     do {                                                \