Merge branch 'open-mpi:main' into allreduce

open-mpi · Jan 4, 2024 · ab8bcff · ab8bcff
2 parents e937de8 + ec01df5
commit ab8bcff
Show file tree

Hide file tree

Showing 15 changed files with 181 additions and 165 deletions.
diff --git a/3rd-party/openpmix b/3rd-party/openpmix
diff --git a/3rd-party/prrte b/3rd-party/prrte
diff --git a/ompi/mca/coll/accelerator/Makefile.am b/ompi/mca/coll/accelerator/Makefile.am
@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2014      The University of Tennessee and The University
+#                         of Tennessee Research Foundation.  All rights
+#                         reserved.
+# Copyright (c) 2014      NVIDIA Corporation.  All rights reserved.
+# Copyright (c) 2017      IBM Corporation.  All rights reserved.
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+dist_ompidata_DATA = help-mpi-coll-accelerator.txt
+
+sources = coll_accelerator_module.c coll_accelerator_reduce.c coll_accelerator_allreduce.c \
+          coll_accelerator_reduce_scatter_block.c coll_accelerator_component.c \
+          coll_accelerator_scan.c coll_accelerator_exscan.c coll_accelerator.h
+
+# Make the output library in this directory, and name it either
+# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
+# (for static builds).
+
+if MCA_BUILD_ompi_coll_accelerator_DSO
+component_noinst =
+component_install = mca_coll_accelerator.la
+else
+component_noinst = libmca_coll_accelerator.la
+component_install =
+endif
+
+mcacomponentdir = $(ompilibdir)
+mcacomponent_LTLIBRARIES = $(component_install)
+mca_coll_accelerator_la_SOURCES = $(sources)
+mca_coll_accelerator_la_LDFLAGS = -module -avoid-version
+mca_coll_accelerator_la_LIBADD = $(top_builddir)/ompi/lib@OMPI_LIBMPI_NAME@.la
+
+noinst_LTLIBRARIES = $(component_noinst)
+libmca_coll_accelerator_la_SOURCES =$(sources)
+libmca_coll_accelerator_la_LDFLAGS = -module -avoid-version
+
diff --git a/ompi/mca/coll/cuda/coll_cuda.h → ompi/mca/coll/accelerator/coll_accelerator.h b/ompi/mca/coll/cuda/coll_cuda.h → ompi/mca/coll/accelerator/coll_accelerator.h
@@ -3,15 +3,16 @@
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2014-2015 NVIDIA Corporation.  All rights reserved.
+ * Copyright (c) 2024      Triad National Security, LLC. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
  *
  * $HEADER$
  */
 
-#ifndef MCA_COLL_CUDA_EXPORT_H
-#define MCA_COLL_CUDA_EXPORT_H
+#ifndef MCA_COLL_ACCELERATOR_EXPORT_H
+#define MCA_COLL_ACCELERATOR_EXPORT_H
 
 #include "ompi_config.h"
 
@@ -31,43 +32,43 @@ BEGIN_C_DECLS
 
 /* API functions */
 
-int mca_coll_cuda_init_query(bool enable_progress_threads,
+int mca_coll_accelerator_init_query(bool enable_progress_threads,
                              bool enable_mpi_threads);
 mca_coll_base_module_t
-*mca_coll_cuda_comm_query(struct ompi_communicator_t *comm,
+*mca_coll_accelerator_comm_query(struct ompi_communicator_t *comm,
                           int *priority);
 
-int mca_coll_cuda_module_enable(mca_coll_base_module_t *module,
+int mca_coll_accelerator_module_enable(mca_coll_base_module_t *module,
                                 struct ompi_communicator_t *comm);
 
 int
-mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
+mca_coll_accelerator_allreduce(const void *sbuf, void *rbuf, int count,
                         struct ompi_datatype_t *dtype,
                         struct ompi_op_t *op,
                         struct ompi_communicator_t *comm,
                         mca_coll_base_module_t *module);
 
-int mca_coll_cuda_reduce(const void *sbuf, void *rbuf, int count,
+int mca_coll_accelerator_reduce(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
                          struct ompi_op_t *op,
                          int root,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module);
 
-int mca_coll_cuda_exscan(const void *sbuf, void *rbuf, int count,
+int mca_coll_accelerator_exscan(const void *sbuf, void *rbuf, int count,
                          struct ompi_datatype_t *dtype,
                          struct ompi_op_t *op,
                          struct ompi_communicator_t *comm,
                          mca_coll_base_module_t *module);
 
-int mca_coll_cuda_scan(const void *sbuf, void *rbuf, int count,
+int mca_coll_accelerator_scan(const void *sbuf, void *rbuf, int count,
                        struct ompi_datatype_t *dtype,
                        struct ompi_op_t *op,
                        struct ompi_communicator_t *comm,
                        mca_coll_base_module_t *module);
 
 int
-mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
+mca_coll_accelerator_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
                                    struct ompi_datatype_t *dtype,
                                    struct ompi_op_t *op,
                                    struct ompi_communicator_t *comm,
@@ -83,7 +84,7 @@ mca_coll_cuda_reduce_scatter_block(const void *sbuf, void *rbuf, int rcount,
  * @retval >0                The buffer belongs to a managed buffer in
  *                           device memory.
  */
-static inline int mca_coll_cuda_check_buf(void *addr)
+static inline int mca_coll_accelerator_check_buf(void *addr)
 {
     uint64_t flags;
     int dev_id;
@@ -94,13 +95,13 @@ static inline int mca_coll_cuda_check_buf(void *addr)
     }
 }
 
-static inline void *mca_coll_cuda_memcpy(void *dest, const void *src, size_t size)
+static inline void *mca_coll_accelerator_memcpy(void *dest, const void *src, size_t size)
 {
     int res;
     res = opal_accelerator.mem_copy(MCA_ACCELERATOR_NO_DEVICE_ID, MCA_ACCELERATOR_NO_DEVICE_ID,
                                     dest, src, size, MCA_ACCELERATOR_TRANSFER_UNSPEC);
     if (res != 0) {
-        opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src,
+        opal_output(0, "coll/accelerator: Error in mem_copy: res=%d, dest=%p, src=%p, size=%d", res, dest, src,
                     (int) size);
         abort();
     } else {
@@ -111,28 +112,28 @@ static inline void *mca_coll_cuda_memcpy(void *dest, const void *src, size_t siz
 /* Types */
 /* Module */
 
-typedef struct mca_coll_cuda_module_t {
+typedef struct mca_coll_accelerator_module_t {
     mca_coll_base_module_t super;
 
     /* Pointers to all the "real" collective functions */
     mca_coll_base_comm_coll_t c_coll;
-} mca_coll_cuda_module_t;
+} mca_coll_accelerator_module_t;
 
-OBJ_CLASS_DECLARATION(mca_coll_cuda_module_t);
+OBJ_CLASS_DECLARATION(mca_coll_accelerator_module_t);
 
 /* Component */
 
-typedef struct mca_coll_cuda_component_t {
+typedef struct mca_coll_accelerator_component_t {
     mca_coll_base_component_2_4_0_t super;
 
     int priority; /* Priority of this component */
-    int disable_cuda_coll;  /* Force disable of the CUDA collective component */
-} mca_coll_cuda_component_t;
+    int disable_accelerator_coll;  /* Force disable of the accelerator collective component */
+} mca_coll_accelerator_component_t;
 
 /* Globally exported variables */
 
-OMPI_DECLSPEC extern mca_coll_cuda_component_t mca_coll_cuda_component;
+OMPI_DECLSPEC extern mca_coll_accelerator_component_t mca_coll_accelerator_component;
 
 END_C_DECLS
 
-#endif /* MCA_COLL_CUDA_EXPORT_H */
+#endif /* MCA_COLL_ACCELERATOR_EXPORT_H */
diff --git a/ompi/mca/coll/cuda/coll_cuda_allreduce.c → .../accelerator/coll_accelerator_allreduce.c b/ompi/mca/coll/cuda/coll_cuda_allreduce.c → .../accelerator/coll_accelerator_allreduce.c
@@ -4,6 +4,7 @@
  *                         reserved.
  * Copyright (c) 2014-2015 NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2022      Amazon.com, Inc. or its affiliates.  All Rights reserved.
+ * Copyright (c) 2024      Triad National Security, LLC. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -12,7 +13,7 @@
  */
 
 #include "ompi_config.h"
-#include "coll_cuda.h"
+#include "coll_accelerator.h"
 
 #include <stdio.h>
 
@@ -27,20 +28,20 @@
  *	Returns:	- MPI_SUCCESS or error code
  */
 int
-mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
+mca_coll_accelerator_allreduce(const void *sbuf, void *rbuf, int count,
                         struct ompi_datatype_t *dtype,
                         struct ompi_op_t *op,
                         struct ompi_communicator_t *comm,
                         mca_coll_base_module_t *module)
 {
-    mca_coll_cuda_module_t *s = (mca_coll_cuda_module_t*) module;
+    mca_coll_accelerator_module_t *s = (mca_coll_accelerator_module_t*) module;
     ptrdiff_t gap;
     char *rbuf1 = NULL, *sbuf1 = NULL, *rbuf2 = NULL;
     size_t bufsize;
     int rc;
 
     bufsize = opal_datatype_span(&dtype->super, count, &gap);
-    rc = mca_coll_cuda_check_buf((void *)sbuf);
+    rc = mca_coll_accelerator_check_buf((void *)sbuf);
     if (rc < 0) {
         return rc;
     }
@@ -49,10 +50,10 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
         if (NULL == sbuf1) {
             return OMPI_ERR_OUT_OF_RESOURCE;
         }
-        mca_coll_cuda_memcpy(sbuf1, sbuf, bufsize);
+        mca_coll_accelerator_memcpy(sbuf1, sbuf, bufsize);
         sbuf = sbuf1 - gap;
     }
-    rc = mca_coll_cuda_check_buf(rbuf);
+    rc = mca_coll_accelerator_check_buf(rbuf);
     if (rc < 0) {
         return rc;
     }
@@ -62,7 +63,7 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
             if (NULL != sbuf1) free(sbuf1);
             return OMPI_ERR_OUT_OF_RESOURCE;
         }
-        mca_coll_cuda_memcpy(rbuf1, rbuf, bufsize);
+        mca_coll_accelerator_memcpy(rbuf1, rbuf, bufsize);
         rbuf2 = rbuf; /* save away original buffer */
         rbuf = rbuf1 - gap;
     }
@@ -72,7 +73,7 @@ mca_coll_cuda_allreduce(const void *sbuf, void *rbuf, int count,
     }
     if (NULL != rbuf1) {
         rbuf = rbuf2;
-        mca_coll_cuda_memcpy(rbuf, rbuf1, bufsize);
+        mca_coll_accelerator_memcpy(rbuf, rbuf1, bufsize);
         free(rbuf1);
     }
     return rc;

diff --git a/ompi/mca/coll/cuda/coll_cuda_component.c → .../accelerator/coll_accelerator_component.c b/ompi/mca/coll/cuda/coll_cuda_component.c → .../accelerator/coll_accelerator_component.c
@@ -6,6 +6,7 @@
  * Copyright (c) 2014-2015 NVIDIA Corporation.  All rights reserved.
  * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
  *                         reserved.
+ * Copyright (c) 2024      Triad National Security, LLC. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -19,25 +20,25 @@
 
 #include "mpi.h"
 #include "ompi/constants.h"
-#include "coll_cuda.h"
+#include "coll_accelerator.h"
 
 /*
- * Public string showing the coll ompi_cuda component version number
+ * Public string showing the coll ompi_accelerator component version number
  */
-const char *mca_coll_cuda_component_version_string =
-    "Open MPI cuda collective MCA component version " OMPI_VERSION;
+const char *mca_coll_accelerator_component_version_string =
+    "Open MPI accelerator collective MCA component version " OMPI_VERSION;
 
 /*
  * Local function
  */
-static int cuda_register(void);
+static int accelerator_register(void);
 
 /*
  * Instantiate the public struct with all of our public information
  * and pointers to our public functions in it
  */
 
-mca_coll_cuda_component_t mca_coll_cuda_component = {
+mca_coll_accelerator_component_t mca_coll_accelerator_component = {
     {
         /* First, the mca_component_t struct containing meta information
          * about the component itself */
@@ -46,12 +47,12 @@ mca_coll_cuda_component_t mca_coll_cuda_component = {
             MCA_COLL_BASE_VERSION_2_4_0,
 
             /* Component name and version */
-            .mca_component_name = "cuda",
+            .mca_component_name = "accelerator",
             MCA_BASE_MAKE_VERSION(component, OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION,
                                   OMPI_RELEASE_VERSION),
 
             /* Component open and close functions */
-            .mca_register_component_params = cuda_register,
+            .mca_register_component_params = accelerator_register,
         },
         .collm_data = {
             /* The component is checkpoint ready */
@@ -60,32 +61,32 @@ mca_coll_cuda_component_t mca_coll_cuda_component = {
 
         /* Initialization / querying functions */
 
-        .collm_init_query = mca_coll_cuda_init_query,
-        .collm_comm_query = mca_coll_cuda_comm_query,
+        .collm_init_query = mca_coll_accelerator_init_query,
+        .collm_comm_query = mca_coll_accelerator_comm_query,
     },
 
-    /* cuda-specific component information */
+    /* accelerator-specific component information */
 
     /* Priority: make it above all point to point collectives including self */
     .priority = 78,
 };
 
 
-static int cuda_register(void)
+static int accelerator_register(void)
 {
-    (void) mca_base_component_var_register(&mca_coll_cuda_component.super.collm_version,
-                                           "priority", "Priority of the cuda coll component; only relevant if barrier_before or barrier_after is > 0",
+    (void) mca_base_component_var_register(&mca_coll_accelerator_component.super.collm_version,
+                                           "priority", "Priority of the accelerator coll component; only relevant if barrier_before or barrier_after is > 0",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_6,
                                            MCA_BASE_VAR_SCOPE_READONLY,
-                                           &mca_coll_cuda_component.priority);
+                                           &mca_coll_accelerator_component.priority);
 
-    (void) mca_base_component_var_register(&mca_coll_cuda_component.super.collm_version,
-                                           "disable_cuda_coll", "Automatically handle the CUDA buffers for the MPI collective.",
+    (void) mca_base_component_var_register(&mca_coll_accelerator_component.super.collm_version,
+                                           "disable_accelerator_coll", "Automatically handle the accelerator buffers for the MPI collective.",
                                            MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                            OPAL_INFO_LVL_2,
                                            MCA_BASE_VAR_SCOPE_READONLY,
-                                           &mca_coll_cuda_component.disable_cuda_coll);
+                                           &mca_coll_accelerator_component.disable_accelerator_coll);
 
     return OMPI_SUCCESS;
 }