Skip to content

Commit 7d87712

Browse files
committed
Improve pack/unpack performance if CUDA/ROCm enabled
Instead of replacing individual memcpy() calls with function pointer calls, use the existing checksum-related logic to compile two versions of every pack/unpack function: one without GPU support and one with GPU support (built with -DOPAL_DATATYPE_PACK_UNPACK_GPU). Signed-off-by: Bart Oldeman <[email protected]>
1 parent 97f88e3 commit 7d87712

7 files changed

+88
-14
lines changed

Diff for: opal/datatype/Makefile.am

+16
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,16 @@ noinst_LTLIBRARIES = \
4646
# these sources will be compiled with the special -D
4747
libdatatype_reliable_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
4848
libdatatype_reliable_la_CFLAGS = -DCHECKSUM $(AM_CFLAGS)
49+
if OPAL_cuda_support
50+
libdatatype_gpu_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
51+
libdatatype_gpu_la_CFLAGS = -DOPAL_DATATYPE_PACK_UNPACK_GPU $(AM_CFLAGS)
52+
noinst_LTLIBRARIES += libdatatype_gpu.la
53+
endif
54+
if OPAL_rocm_support
55+
libdatatype_gpu_la_SOURCES = opal_datatype_pack.c opal_datatype_unpack.c
56+
libdatatype_gpu_la_CFLAGS = -DOPAL_DATATYPE_PACK_UNPACK_GPU $(AM_CFLAGS)
57+
noinst_LTLIBRARIES += libdatatype_gpu.la
58+
endif
4959

5060
# these sources will be compiled with the normal CFLAGS only
5161
libdatatype_la_SOURCES = \
@@ -71,6 +81,12 @@ libdatatype_la_SOURCES = \
7181
opal_datatype_unpack.c
7282

7383
libdatatype_la_LIBADD = libdatatype_reliable.la
84+
if OPAL_cuda_support
85+
libdatatype_la_LIBADD += libdatatype_gpu.la
86+
endif
87+
if OPAL_rocm_support
88+
libdatatype_la_LIBADD += libdatatype_gpu.la
89+
endif
7490

7591
# Conditionally install the header files
7692
if WANT_INSTALL_HEADERS

Diff for: opal/datatype/opal_convertor.c

+34-4
Original file line numberDiff line numberDiff line change
@@ -592,7 +592,19 @@ int32_t opal_convertor_prepare_for_recv(opal_convertor_t *convertor,
592592
}
593593
}
594594
} else {
595-
#endif /* defined(CHECKSUM) */
595+
#elif OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
596+
if (OPAL_UNLIKELY(convertor->flags & (CONVERTOR_CUDA | CONVERTOR_ROCM))) {
597+
if (OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS))) {
598+
convertor->fAdvance = opal_unpack_general_gpu;
599+
} else {
600+
if (convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) {
601+
convertor->fAdvance = opal_unpack_homogeneous_contig_gpu;
602+
} else {
603+
convertor->fAdvance = opal_generic_simple_unpack_gpu;
604+
}
605+
}
606+
} else {
607+
#endif /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT */
596608
if (OPAL_UNLIKELY(!(convertor->flags & CONVERTOR_HOMOGENEOUS))) {
597609
convertor->fAdvance = opal_unpack_general;
598610
} else {
@@ -602,7 +614,7 @@ int32_t opal_convertor_prepare_for_recv(opal_convertor_t *convertor,
602614
convertor->fAdvance = opal_generic_simple_unpack;
603615
}
604616
}
605-
#if defined(CHECKSUM)
617+
#if defined(CHECKSUM) || OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
606618
}
607619
#endif
608620
return OPAL_SUCCESS;
@@ -643,7 +655,25 @@ int32_t opal_convertor_prepare_for_send(opal_convertor_t *convertor,
643655
}
644656
}
645657
} else {
646-
#endif /* defined(CHECKSUM) */
658+
#elif OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
659+
if (convertor->flags & (CONVERTOR_CUDA | CONVERTOR_ROCM)) {
660+
if (CONVERTOR_SEND_CONVERSION
661+
== (convertor->flags & (CONVERTOR_SEND_CONVERSION | CONVERTOR_HOMOGENEOUS))) {
662+
convertor->fAdvance = opal_pack_general_gpu;
663+
} else {
664+
if (datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) {
665+
if (((datatype->ub - datatype->lb) == (ptrdiff_t) datatype->size)
666+
|| (1 >= convertor->count)) {
667+
convertor->fAdvance = opal_pack_homogeneous_contig_gpu;
668+
} else {
669+
convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_gpu;
670+
}
671+
} else {
672+
convertor->fAdvance = opal_generic_simple_pack_gpu;
673+
}
674+
}
675+
} else {
676+
#endif /* defined(CHECKSUM) || OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT */
647677
if (CONVERTOR_SEND_CONVERSION
648678
== (convertor->flags & (CONVERTOR_SEND_CONVERSION | CONVERTOR_HOMOGENEOUS))) {
649679
convertor->fAdvance = opal_pack_general;
@@ -659,7 +689,7 @@ int32_t opal_convertor_prepare_for_send(opal_convertor_t *convertor,
659689
convertor->fAdvance = opal_generic_simple_pack;
660690
}
661691
}
662-
#if defined(CHECKSUM)
692+
#if defined(CHECKSUM) || OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
663693
}
664694
#endif
665695
return OPAL_SUCCESS;

Diff for: opal/datatype/opal_datatype_pack.c

+6
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@
4949
opal_pack_homogeneous_contig_with_gaps_checksum
5050
# define opal_generic_simple_pack_function opal_generic_simple_pack_checksum
5151
# define opal_pack_general_function opal_pack_general_checksum
52+
#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
53+
# define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_gpu
54+
# define opal_pack_homogeneous_contig_with_gaps_function \
55+
opal_pack_homogeneous_contig_with_gaps_gpu
56+
# define opal_generic_simple_pack_function opal_generic_simple_pack_gpu
57+
# define opal_pack_general_function opal_pack_general_gpu
5258
#else
5359
# define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig
5460
# define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps

Diff for: opal/datatype/opal_datatype_pack.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#include "opal_config.h"
2424
#include "opal/datatype/opal_datatype_pack_unpack_predefined.h"
2525

26-
#if !defined(CHECKSUM) && (OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT)
26+
#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
2727
/* Make use of existing macro to do CUDA style memcpy */
2828
# undef MEMCPY_CSUM
2929
# define MEMCPY_CSUM(DST, SRC, BLENGTH, CONVERTOR) \
@@ -105,16 +105,16 @@ static inline void pack_predefined_data(opal_convertor_t *CONVERTOR, const dt_el
105105
/* preemptively update the number of COUNT we will return. */
106106
*(COUNT) -= cando_count;
107107

108+
#if !defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
108109
if (_elem->blocklen < 9) {
109-
if ( !(CONVERTOR->flags & CONVERTOR_CUDA) &&
110-
!(CONVERTOR->flags & CONVERTOR_ROCM)
111-
&& OPAL_LIKELY(
110+
if (OPAL_LIKELY(
112111
OPAL_SUCCESS
113112
== opal_datatype_pack_predefined_element(&_memory, &_packed, cando_count, _elem))) {
114113
goto update_and_return;
115114
}
116115
/* else unrecognized _elem->common.type, use the memcpy path */
117116
}
117+
#endif
118118

119119
if (_elem->blocklen == 1) {
120120
for (; cando_count > 0; cando_count--) {

Diff for: opal/datatype/opal_datatype_prototypes.h

+18
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ OPAL_DECLSPEC int32_t opal_unpack_general(opal_convertor_t *pConvertor, struct i
3030
uint32_t *out_size, size_t *max_data);
3131
OPAL_DECLSPEC int32_t opal_unpack_general_checksum(opal_convertor_t *pConvertor, struct iovec *iov,
3232
uint32_t *out_size, size_t *max_data);
33+
#if OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
34+
OPAL_DECLSPEC int32_t opal_pack_general_gpu(opal_convertor_t *pConvertor, struct iovec *iov,
35+
uint32_t *out_size, size_t *max_data);
36+
OPAL_DECLSPEC int32_t opal_unpack_general_gpu(opal_convertor_t *pConvertor, struct iovec *iov,
37+
uint32_t *out_size, size_t *max_data);
38+
#endif
3339

3440
/*
3541
* Now the internal functions
@@ -54,6 +60,18 @@ int32_t opal_generic_simple_unpack(opal_convertor_t *pConvertor, struct iovec *i
5460
uint32_t *out_size, size_t *max_data);
5561
int32_t opal_generic_simple_unpack_checksum(opal_convertor_t *pConvertor, struct iovec *iov,
5662
uint32_t *out_size, size_t *max_data);
63+
#if OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
64+
int32_t opal_pack_homogeneous_contig_gpu(opal_convertor_t *pConv, struct iovec *iov,
65+
uint32_t *out_size, size_t *max_data);
66+
int32_t opal_pack_homogeneous_contig_with_gaps_gpu(opal_convertor_t *pConv, struct iovec *iov,
67+
uint32_t *out_size, size_t *max_data);
68+
int32_t opal_generic_simple_pack_gpu(opal_convertor_t *pConvertor, struct iovec *iov,
69+
uint32_t *out_size, size_t *max_data);
70+
int32_t opal_unpack_homogeneous_contig_gpu(opal_convertor_t *pConv, struct iovec *iov,
71+
uint32_t *out_size, size_t *max_data);
72+
int32_t opal_generic_simple_unpack_gpu(opal_convertor_t *pConvertor, struct iovec *iov,
73+
uint32_t *out_size, size_t *max_data);
74+
#endif
5775

5876
END_C_DECLS
5977

Diff for: opal/datatype/opal_datatype_unpack.c

+6-2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@
4949
# define opal_unpack_general_function opal_unpack_general_checksum
5050
# define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum
5151
# define opal_generic_simple_unpack_function opal_generic_simple_unpack_checksum
52+
#elif defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
53+
# define opal_unpack_general_function opal_unpack_general_gpu
54+
# define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_gpu
55+
# define opal_generic_simple_unpack_function opal_generic_simple_unpack_gpu
5256
#else
5357
# define opal_unpack_general_function opal_unpack_general
5458
# define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig
@@ -217,7 +221,7 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_
217221
MEMCPY( temporary + start_position, partial_data, length );
218222

219223
/* Save the original content of the user memory */
220-
#if OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
224+
#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
221225
/* In the case where the data is being unpacked from device memory, need to
222226
* use the special host to device memory copy. */
223227
pConvertor->cbmemcpy(saved_data, user_data, data_length, pConvertor );
@@ -235,7 +239,7 @@ opal_unpack_partial_predefined(opal_convertor_t *pConvertor, const dt_elem_desc_
235239

236240
/* Rebuild the data by pulling back the unmodified bytes from the original
237241
* content in the user memory. */
238-
#if OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT
242+
#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
239243
/* Need to copy the modified user_data again so we can see which
240244
* bytes need to be converted back to their original values. */
241245
{

Diff for: opal/datatype/opal_datatype_unpack.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
#include "opal_config.h"
2323
#include "opal/datatype/opal_datatype_pack_unpack_predefined.h"
2424

25-
#if !defined(CHECKSUM) && (OPAL_CUDA_SUPPORT || OPAL_ROCM_SUPPORT)
25+
#if defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
2626
/* Make use of existing macro to do CUDA style memcpy */
2727
# undef MEMCPY_CSUM
2828
# define MEMCPY_CSUM(DST, SRC, BLENGTH, CONVERTOR) \
@@ -102,16 +102,16 @@ static inline void unpack_predefined_data(opal_convertor_t *CONVERTOR, const dt_
102102
/* preemptively update the number of COUNT we will return. */
103103
*(COUNT) -= cando_count;
104104

105+
#if !defined(OPAL_DATATYPE_PACK_UNPACK_GPU)
105106
if (_elem->blocklen < 9) {
106-
if ( !(CONVERTOR->flags & CONVERTOR_CUDA) &&
107-
!(CONVERTOR->flags & CONVERTOR_ROCM)
108-
&& OPAL_LIKELY(OPAL_SUCCESS
107+
if (OPAL_LIKELY(OPAL_SUCCESS
109108
== opal_datatype_unpack_predefined_element(&_packed, &_memory,
110109
cando_count, _elem))) {
111110
goto update_and_return;
112111
}
113112
/* else unrecognized _elem->common.type, use the memcpy path */
114113
}
114+
#endif
115115

116116
if (1 == _elem->blocklen) { /* Do as many full blocklen as possible */
117117
for (; cando_count > 0; cando_count--) {

0 commit comments

Comments
 (0)