Improve pack/unpack performance if CUDA/ROCm enabled

bartoldeman · bartoldeman · commit 7d87712be51d · 2022-05-10T09:50:01.000-04:00
Instead of replacing individual memcpy()s with function pointer
calls, use the existing checksum-related logic to compile two
versions of every pack/unpack function, one without GPU and one with
GPU support (using -DOPAL_DATATYPE_PACK_UNPACK_GPU).

Signed-off-by: Bart Oldeman <bart.oldeman@calculquebec.ca>
diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am
@@ -46,6 +46,16 @@ noinst_LTLIBRARIES = \
 # these sources will be compiled with the special -D
 libdatatype_reliable_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
 libdatatype_reliable_la_CFLAGS = -DCHECKSUM $(AM_CFLAGS)
+if OPAL_cuda_support
+libdatatype_gpu_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
+libdatatype_gpu_la_CFLAGS = -DOPAL_DATATYPE_PACK_UNPACK_GPU $(AM_CFLAGS)
+noinst_LTLIBRARIES += libdatatype_gpu.la
+endif
+if OPAL_rocm_support
+libdatatype_gpu_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
+libdatatype_gpu_la_CFLAGS = -DOPAL_DATATYPE_PACK_UNPACK_GPU $(AM_CFLAGS)
+noinst_LTLIBRARIES += libdatatype_gpu.la
+endif
 
 # these sources will be compiled with the normal CFLAGS only
 libdatatype_la_SOURCES = \
@@ -71,6 +81,12 @@ libdatatype_la_SOURCES = \
         opal_datatype_unpack.c
 
 libdatatype_la_LIBADD = libdatatype_reliable.la
+if OPAL_cuda_support
+libdatatype_la_LIBADD += libdatatype_gpu.la
+endif
+if OPAL_rocm_support
+libdatatype_la_LIBADD += libdatatype_gpu.la
+endif
 
 # Conditionally install the header files
 if WANT_INSTALL_HEADERS
diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c
@@ -592,7 +592,19 @@ int32_t opal_convertor_prepare_for_recv(opal_convertor_t *convertor,
             }
         }
     } else {
-#endif /* defined(CHECKSUM) */
+#elif OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
+    if (OPAL_UNLIKELY(convertor->flags & (CONVERTOR_CUDA | CONVERTOR_ROCM))) {
+        if (OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS))) {
+            convertor->fAdvance = opal_unpack_general_gpu;
+        } else {
+            if (convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) {
+                convertor->fAdvance = opal_unpack_homogeneous_contig_gpu;
+            } else {
+                convertor->fAdvance = opal_generic_simple_unpack_gpu;
+            }
+        }
+    } else {
+#endif /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT */
         if (OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS))) {
             convertor->fAdvance = opal_unpack_general;
         } else {
@@ -602,7 +614,7 @@ int32_t opal_convertor_prepare_for_recv(opal_convertor_t *convertor,
                 convertor->fAdvance = opal_generic_simple_unpack;
             }
         }
-#if defined(CHECKSUM)
+#if defined(CHECKSUM) || OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
     }
 #endif
     return OPAL_SUCCESS;
@@ -643,7 +655,25 @@ int32_t opal_convertor_prepare_for_send(opal_convertor_t *convertor,
             }
         }
     } else {
-#endif /* defined(CHECKSUM) */
+#elif OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
+    if (convertor->flags & (CONVERTOR_CUDA | CONVERTOR_ROCM)) {
+        if (CONVERTOR_SEND_CONVERSION
+            == (convertor->flags & (CONVERTOR_SEND_CONVERSION | CONVERTOR_HOMOGENEOUS))) {
+            convertor->fAdvance = opal_pack_general_gpu;
+        } else {
+            if (datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) {
+                if (((datatype->ub - datatype->lb) == (ptrdiff_t) datatype->size)
+                    || (1 >= convertor->count)) {
+                    convertor->fAdvance = opal_pack_homogeneous_contig_gpu;
+                } else {
+                    convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_gpu;
+                }
+            } else {
+                convertor->fAdvance = opal_generic_simple_pack_gpu;
+            }
+        }
+    } else {
+#endif /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT */
         if (CONVERTOR_SEND_CONVERSION
             == (convertor->flags & (CONVERTOR_SEND_CONVERSION | CONVERTOR_HOMOGENEOUS))) {
             convertor->fAdvance = opal_pack_general;
@@ -659,7 +689,7 @@ int32_t opal_convertor_prepare_for_send(opal_convertor_t *convertor,
                 convertor->fAdvance = opal_generic_simple_pack;
             }
         }
-#if defined(CHECKSUM)
+#if defined(CHECKSUM) || OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
     }
 #endif
     return OPAL_SUCCESS;
diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c
@@ -49,6 +49,12 @@
         opal_pack_homogeneous_contig_with_gaps_checksum
 #    define opal_generic_simple_pack_function opal_generic_simple_pack_checksum
 #    define opal_pack_general_function        opal_pack_general_checksum
+#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+#    define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_gpu
+#    define opal_pack_homogeneous_contig_with_gaps_function \
+        opal_pack_homogeneous_contig_with_gaps_gpu
+#    define opal_generic_simple_pack_function opal_generic_simple_pack_gpu
+#    define opal_pack_general_function        opal_pack_general_gpu
 #else
 #    define opal_pack_homogeneous_contig_function           opal_pack_homogeneous_contig
 #    define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps
diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h
@@ -23,7 +23,7 @@
 #include "opal_config.h"
 #include "opal/datatype/opal_datatype_pack_unpack_predefined.h"
 
-#if !defined(CHECKSUM) && (OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT)
+#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
 /* Make use of existing macro to do CUDA style memcpy */
 #    undef MEMCPY_CSUM
 #    define MEMCPY_CSUM(DST, SRC, BLENGTH, CONVERTOR) \
@@ -105,16 +105,16 @@ static inline void pack_predefined_data(opal_convertor_t *CONVERTOR, const dt_el
     /* premptively update the number of COUNT we will return. */
     *(COUNT) -= cando_count;
 
+#if !defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
     if (_elem->blocklen < 9) {
-        if ( !(CONVERTOR->flags & CONVERTOR_CUDA) &&
-             !(CONVERTOR->flags & CONVERTOR_ROCM)
-            && OPAL_LIKELY(
+        if (OPAL_LIKELY(
                 OPAL_SUCCESS
                 == opal_datatype_pack_predefined_element(&_memory, &_packed, cando_count, _elem))) {
             goto update_and_return;
         }
         /* else unrecognized _elem->common.type, use the memcpy path */
     }
+#endif
 
     if (_elem->blocklen == 1) {
         for (; cando_count > 0; cando_count--) {
diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h
@@ -30,6 +30,12 @@ OPAL_DECLSPEC int32_t opal_unpack_general(opal_convertor_t *pConvertor, struct i
                                           uint32_t *out_size, size_t *max_data);
 OPAL_DECLSPEC int32_t opal_unpack_general_checksum(opal_convertor_t *pConvertor, struct iovec *iov,
                                                    uint32_t *out_size, size_t *max_data);
+#if OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
+OPAL_DECLSPEC int32_t opal_pack_general_gpu(opal_convertor_t *pConvertor, struct iovec *iov,
+                                             uint32_t *out_size, size_t *max_data);
+OPAL_DECLSPEC int32_t opal_unpack_general_gpu(opal_convertor_t *pConvertor, struct iovec *iov,
+                                               uint32_t *out_size, size_t *max_data);
+#endif
 
 /*
  * Now the internal functions
@@ -54,6 +60,18 @@ int32_t opal_generic_simple_unpack(opal_convertor_t *pConvertor, struct iovec *i
                                    uint32_t *out_size, size_t *max_data);
 int32_t opal_generic_simple_unpack_checksum(opal_convertor_t *pConvertor, struct iovec *iov,
                                             uint32_t *out_size, size_t *max_data);
+#if OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
+int32_t opal_pack_homogeneous_contig_gpu(opal_convertor_t *pConv, struct iovec *iov,
+                                          uint32_t *out_size, size_t *max_data);
+int32_t opal_pack_homogeneous_contig_with_gaps_gpu(opal_convertor_t *pConv, struct iovec *iov,
+                                                    uint32_t *out_size, size_t *max_data);
+int32_t opal_generic_simple_pack_gpu(opal_convertor_t *pConvertor, struct iovec *iov,
+                                      uint32_t *out_size, size_t *max_data);
+int32_t opal_unpack_homogeneous_contig_gpu(opal_convertor_t *pConv, struct iovec *iov,
+                                            uint32_t *out_size, size_t *max_data);
+int32_t opal_generic_simple_unpack_gpu(opal_convertor_t *pConvertor, struct iovec *iov,
+                                        uint32_t *out_size, size_t *max_data);
+#endif
 
 END_C_DECLS
 
diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c
@@ -49,6 +49,10 @@
 #    define opal_unpack_general_function            opal_unpack_general_checksum
 #    define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum
 #    define opal_generic_simple_unpack_function     opal_generic_simple_unpack_checksum
+#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
+#    define opal_unpack_general_function            opal_unpack_general_gpu
+#    define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_gpu
+#    define opal_generic_simple_unpack_function     opal_generic_simple_unpack_gpu
 #else
 #    define opal_unpack_general_function            opal_unpack_general
 #    define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig
@@ -217,7 +221,7 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_
     MEMCPY( temporary + start_position, partial_data, length );
 
     /* Save the original content of the user memory */
-#if OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
+#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
     /* In the case where the data is being unpacked from device memory, need to
      * use the special host to device memory copy. */
     pConvertor->cbmemcpy(saved_data, user_data, data_length, pConvertor );
@@ -235,7 +239,7 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_
 
     /* Rebuild the data by pulling back the unmodified bytes from the original
      * content in the user memory. */
-#if OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
+#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
     /* Need to copy the modified user_data again so we can see which
      * bytes need to be converted back to their original values. */
     {
diff --git a/opal/datatype/opal_datatype_unpack.h b/opal/datatype/opal_datatype_unpack.h
@@ -22,7 +22,7 @@
 #include "opal_config.h"
 #include "opal/datatype/opal_datatype_pack_unpack_predefined.h"
 
-#if !defined(CHECKSUM) && (OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT)
+#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
 /* Make use of existing macro to do CUDA style memcpy */
 #    undef MEMCPY_CSUM
 #    define MEMCPY_CSUM(DST, SRC, BLENGTH, CONVERTOR) \
@@ -102,16 +102,16 @@ static inline void unpack_predefined_data(opal_convertor_t *CONVERTOR, const dt_
     /* preemptively update the number of COUNT we will return. */
     *(COUNT) -= cando_count;
 
+#if !defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
     if (_elem->blocklen < 9) {
-        if ( !(CONVERTOR->flags & CONVERTOR_CUDA) &&
-             !(CONVERTOR->flags & CONVERTOR_ROCM)
-            && OPAL_LIKELY(OPAL_SUCCESS
+        if (OPAL_LIKELY(OPAL_SUCCESS
                            == opal_datatype_unpack_predefined_element(&_packed, &_memory,
                                                                       cando_count, _elem))) {
             goto update_and_return;
         }
         /* else unrecognized _elem->common.type, use the memcpy path */
     }
+#endif
 
     if (1 == _elem->blocklen) {  /* Do as many full blocklen as possible */
         for (; cando_count > 0; cando_count--) {