Add CUDA/HIP implementations of reduction operators #12569

Status: Open. Wants to merge 12 commits into base: main.
2 changes: 1 addition & 1 deletion .github/workflows/compile-cuda.yaml
@@ -24,5 +24,5 @@ jobs:
- name: Build Open MPI
run: |
./autogen.pl
./configure --prefix=${PWD}/install --with-cuda=${CUDA_PATH} --with-cuda-libdir=${CUDA_PATH}/lib64/stubs
./configure --prefix=${PWD}/install --with-cuda=${CUDA_PATH} --with-cuda-libdir=${CUDA_PATH}/lib64/stubs --enable-nvcc NVCC=/usr/local/cuda/bin/nvcc
make -j
4 changes: 2 additions & 2 deletions .github/workflows/compile-rocm.yaml
@@ -20,12 +20,12 @@ jobs:
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/5.7.1 jammy main" | sudo tee --append /etc/apt/sources.list.d/rocm.list
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt update
sudo apt install -y rocm-hip-runtime
sudo apt install -y rocm-hip-runtime hipcc
- uses: actions/checkout@v4
with:
submodules: recursive
- name: Build Open MPI
run: |
./autogen.pl
./configure --prefix=${PWD}/install --with-rocm=/opt/rocm --disable-mpi-fortran
./configure --prefix=${PWD}/install --with-rocm=/opt/rocm --disable-mpi-fortran --enable-hipcc HIPCC=/opt/rocm-5.7.1/bin/hipcc
LD_LIBRARY_PATH=/opt/rocm/lib make -j
125 changes: 125 additions & 0 deletions config/opal_check_cudart.m4
@@ -0,0 +1,125 @@
dnl -*- autoconf -*-
dnl
dnl Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
dnl University Research and Technology
dnl Corporation. All rights reserved.
dnl Copyright (c) 2004-2005 The University of Tennessee and The University
dnl of Tennessee Research Foundation. All rights
dnl reserved.
dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
dnl University of Stuttgart. All rights reserved.
dnl Copyright (c) 2004-2005 The Regents of the University of California.
dnl All rights reserved.
dnl Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved.
dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
dnl reserved.
dnl Copyright (c) 2009-2011 Oak Ridge National Labs. All rights reserved.
dnl Copyright (c) 2011-2015 NVIDIA Corporation. All rights reserved.
dnl Copyright (c) 2015 Research Organization for Information Science
dnl and Technology (RIST). All rights reserved.
dnl Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl


# OPAL_CHECK_CUDART(prefix, [action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Check whether CUDA runtime library support can be found. Sets
# prefix_{CPPFLAGS, LDFLAGS, LIBS} as needed and runs action-if-found
# if there is support; otherwise executes action-if-not-found.

#
# Check for CUDA support
#
AC_DEFUN([OPAL_CHECK_CUDART],[
OPAL_VAR_SCOPE_PUSH([cudart_save_CPPFLAGS cudart_save_LDFLAGS cudart_save_LIBS])

cudart_save_CPPFLAGS="$CPPFLAGS"
cudart_save_LDFLAGS="$LDFLAGS"
cudart_save_LIBS="$LIBS"

#
# Check to see if the user provided paths for CUDART
#
AC_ARG_WITH([cudart],
[AS_HELP_STRING([--with-cudart=DIR],
[Path to the CUDA runtime library and header files])])
AC_MSG_CHECKING([if --with-cudart is set])
AC_ARG_WITH([cudart-libdir],
[AS_HELP_STRING([--with-cudart-libdir=DIR],
[Search for CUDA runtime libraries in DIR])])

####################################
#### Check for CUDA runtime library
####################################
AS_IF([test "x$with_cudart" = "xno" || test "x$with_cudart" = "x"],
[opal_check_cudart_happy=no
AC_MSG_RESULT([not set (--with-cudart=$with_cudart)])],
[AS_IF([test ! -d "$with_cudart"],
[AC_MSG_RESULT([not found])
AC_MSG_WARN([Directory $with_cudart not found])],
[OPAL_FLAGS_APPEND_UNIQ([CPPFLAGS], [-I$with_cudart/include])
AC_CHECK_HEADERS([cuda_runtime.h],
[opal_check_cudart_happy=yes
opal_cudart_incdir="$with_cudart/include"],
[AC_MSG_RESULT([not found])
AC_MSG_WARN([Could not find cuda_runtime.h in $with_cudart/include])])])])
CPPFLAGS=${cudart_save_CPPFLAGS}

# try include path relative to nvcc
AS_IF([test "$opal_check_cudart_happy" = "no" && test "$with_cudart" != "no"],
(bosilca marked this conversation as resolved.)
[AC_PATH_PROG([nvcc_bin], [nvcc], ["not-found"])
AS_IF([test "$nvcc_bin" = "not-found"],
[AC_MSG_WARN([Could not find nvcc binary])],
[nvcc_dirname=`AS_DIRNAME([$nvcc_bin])`
OPAL_FLAGS_APPEND_UNIQ([CPPFLAGS], [-I$nvcc_dirname/../include])
AC_CHECK_HEADERS([cuda_runtime.h],
[opal_check_cudart_happy=yes
with_cudart=$nvcc_dirname/../
opal_cudart_incdir="$with_cudart/include"])])],
[])
CPPFLAGS=${cudart_save_CPPFLAGS}

AS_IF([test x"$with_cudart_libdir" = "x"],
[with_cudart_libdir=$with_cudart/lib64/],
[])

AS_IF([test "$opal_check_cudart_happy" = "yes"],
[OAC_CHECK_PACKAGE([cudart],
[$1],
[cuda_runtime.h],
[cudart],
[cudaMalloc],
[opal_check_cudart_happy="yes"],
[opal_check_cudart_happy="no"])],
[])


AC_MSG_CHECKING([if we have CUDA runtime library support])
if test "$opal_check_cudart_happy" = "yes"; then
AC_MSG_RESULT([yes (-I$opal_cudart_incdir)])
CUDART_SUPPORT=1
common_cudart_CPPFLAGS="-I$opal_cudart_incdir"
AC_SUBST([common_cudart_CPPFLAGS])
else
AC_MSG_RESULT([no])
CUDART_SUPPORT=0
fi


OPAL_SUMMARY_ADD([Accelerators], [CUDART support], [], [$opal_check_cudart_happy])
AM_CONDITIONAL([OPAL_cudart_support], [test "x$CUDART_SUPPORT" = "x1"])
AC_DEFINE_UNQUOTED([OPAL_CUDART_SUPPORT],$CUDART_SUPPORT,
[Whether we have cuda runtime library support])

CPPFLAGS=${cudart_save_CPPFLAGS}
LDFLAGS=${cudart_save_LDFLAGS}
LIBS=${cudart_save_LIBS}
OPAL_VAR_SCOPE_POP
])dnl
48 changes: 48 additions & 0 deletions config/opal_check_hipcc.m4
@@ -0,0 +1,48 @@
dnl -*- autoconf -*-
dnl
dnl Copyright (c) 2024 Stony Brook University. All rights reserved.
dnl
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl

dnl
dnl Check for HIPCC and bail out if HIPCC was requested but not found
dnl Options provided:
dnl --with-hipcc[=path/to/hipcc]: provide a path to HIPCC
dnl --enable-hipcc: require HIPCC, bail out if not found
dnl

AC_DEFUN([OPAL_CHECK_HIPCC],[

AC_ARG_ENABLE([hipcc],
[AS_HELP_STRING([--enable-hipcc],
[Force configure to fail if hipcc is not found (hipcc is used to build HIP operator support).])])

AC_ARG_WITH([hipcc],
[AS_HELP_STRING([--with-hipcc=DIR],
[Path to the HIP compiler])])

AS_IF([test -n "$with_hipcc"],
[HIPCC=$with_hipcc])
AS_IF([test -z "$HIPCC"],
# try to find hipcc in PATH
[AC_PATH_PROG([HIPCC], [hipcc], [])])

# disable support if explicitly specified
AS_IF([test "$enable_hipcc" = "no"],
[HIPCC=])

AS_IF([test -z "$HIPCC" && test "$enable_hipcc" = "yes"],
[AC_MSG_WARN([A suitable HIP compiler was not found, but --enable-hipcc=yes was specified])
AC_MSG_ERROR([Cannot continue])])

OPAL_SUMMARY_ADD([Accelerators], [HIPCC compiler], [], [$HIPCC (flags: $HIPCCFLAGS)])

AC_ARG_VAR([HIPCC], [AMD HIP compiler])
AC_ARG_VAR([HIPCCFLAGS], [AMD HIP compiler flags])

])
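For local builds, the macro above gives two equivalent ways to point configure at a HIP compiler. A sketch; the paths are illustrative and mirror the CI change above:

```shell
# Fail configure if no HIP compiler is found, and name one explicitly:
./configure --enable-hipcc --with-hipcc=/opt/rocm/bin/hipcc

# Equivalent: pass hipcc via the HIPCC precious variable; if neither is
# given, the macro falls back to searching $PATH for "hipcc".
./configure --enable-hipcc HIPCC=/opt/rocm/bin/hipcc
```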
56 changes: 56 additions & 0 deletions config/opal_check_nvcc.m4
@@ -0,0 +1,56 @@
dnl -*- autoconf -*-
dnl
dnl Copyright (c) 2024 Stony Brook University. All rights reserved.
dnl
dnl $COPYRIGHT$
dnl
dnl Additional copyrights may follow
dnl
dnl $HEADER$
dnl

dnl
dnl Check for NVCC and bail out if NVCC was requested but not found
dnl Options provided:
dnl --with-nvcc[=path/to/nvcc]: provide a path to NVCC
dnl --enable-nvcc: require NVCC, bail out if not found
dnl --nvcc-compute-arch: request a specific compute
dnl architecture for the operator
dnl kernels
dnl

AC_DEFUN([OPAL_CHECK_NVCC],[

AC_ARG_ENABLE([nvcc],
[AS_HELP_STRING([--enable-nvcc],
[Force configure to fail if CUDA nvcc is not found (CUDA nvcc is used to build CUDA operator support).])])

AC_ARG_WITH([nvcc],
[AS_HELP_STRING([--with-nvcc=DIR],
[Path to the CUDA compiler])])

AS_IF([test -n "$with_nvcc"],
[NVCC=$with_nvcc])
AS_IF([test -z "$NVCC"],
# try to find nvcc in PATH
[AC_PATH_PROG([NVCC], [nvcc], [])])

# disable ussage of NVCC if explicitly specified
Suggested change (review comment, Member): fix the typo in the comment above, "ussage" → "usage".
AS_IF([test "$enable_nvcc" = "no"],
[NVCC=])

# prepend C++17 standard, allow override by user
AS_IF([test -n "$NVCCFLAGS"],
[NVCCFLAGS="--std c++17 $NVCCFLAGS"],
[NVCCFLAGS="--std c++17"])

AS_IF([test -z "$NVCC" && test "$enable_nvcc" = "yes"],
[AC_MSG_WARN([A suitable CUDA compiler was not found, but --enable-nvcc=yes was specified])
AC_MSG_ERROR([Cannot continue])])

OPAL_SUMMARY_ADD([Accelerators], [NVCC compiler], [], [$NVCC (flags: $NVCCFLAGS)])

AC_ARG_VAR([NVCC], [NVIDIA CUDA compiler])
AC_ARG_VAR([NVCCFLAGS], [NVIDIA CUDA compiler flags])

])
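Because the macro prepends the C++17 flag and keeps any user-supplied NVCCFLAGS after it, extra nvcc options can be layered on top. An illustrative invocation (paths and flags are examples, not taken from the PR):

```shell
# Request CUDA op support and add user nvcc flags; the macro turns
# NVCCFLAGS into: --std c++17 -O3 -arch=sm_80 (user flags appended).
./configure --enable-nvcc NVCC=/usr/local/cuda/bin/nvcc \
            NVCCFLAGS="-O3 -arch=sm_80"
```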
2 changes: 1 addition & 1 deletion config/opal_mca.m4
@@ -186,7 +186,7 @@ of type-component pairs. For example, --enable-mca-no-build=pml-ob1])
else
msg=
if test -z "$enable_mca_dso"; then
enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,btl-smcuda,rcache-gpusm,rcache-rgpusm"
enable_mca_dso="accelerator-cuda,accelerator-rocm,accelerator-ze,btl-smcuda,rcache-gpusm,rcache-rgpusm,op-cuda,op-rocm"
msg="(default)"
fi
DSO_all=0
4 changes: 3 additions & 1 deletion ompi/mca/op/base/op_base_frame.c
@@ -2,7 +2,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2005 The University of Tennessee and The University
* Copyright (c) 2004-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -42,6 +42,7 @@ static void module_constructor(ompi_op_base_module_t *m)
{
m->opm_enable = NULL;
m->opm_op = NULL;
m->opm_device_enabled = false;
memset(&(m->opm_fns), 0, sizeof(m->opm_fns));
memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns));
}
@@ -50,6 +51,7 @@ static void module_constructor_1_0_0(ompi_op_base_module_1_0_0_t *m)
{
m->opm_enable = NULL;
m->opm_op = NULL;
m->opm_device_enabled = false;
memset(&(m->opm_fns), 0, sizeof(m->opm_fns));
memset(&(m->opm_3buff_fns), 0, sizeof(m->opm_3buff_fns));
}
60 changes: 44 additions & 16 deletions ompi/mca/op/base/op_base_op_select.c
@@ -3,7 +3,7 @@
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
* University Research and Technology
* Corporation. All rights reserved.
* Copyright (c) 2004-2009 The University of Tennessee and The University
* Copyright (c) 2004-2023 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -152,22 +152,50 @@ int ompi_op_base_op_select(ompi_op_t *op)
}

/* Copy over the non-NULL pointers */
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
(bosilca marked this conversation as resolved.)
/* 2-buffer variants */
if (NULL != avail->ao_module->opm_fns[i]) {
OBJ_RELEASE(op->o_func.intrinsic.modules[i]);
op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i];
op->o_func.intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
if (avail->ao_module->opm_device_enabled) {
if (NULL == op->o_device_op) {
op->o_device_op = calloc(1, sizeof(*op->o_device_op));
}

/* 3-buffer variants */
if (NULL != avail->ao_module->opm_3buff_fns[i]) {
OBJ_RELEASE(op->o_3buff_intrinsic.modules[i]);
op->o_3buff_intrinsic.fns[i] =
avail->ao_module->opm_3buff_fns[i];
op->o_3buff_intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
Review discussion:

bosilca (Member): Adding an entire new type of ompi_op just to cater to the need for a stream is kind of ugly. I understand the desire to make them as flexible as possible, but in the context of MPI we handle a very restricted number of streams, and we expect the MPI_Op to always execute in a single stream.

Author (Contributor): Ideally, we will come to a point where the user can provide us with a stream. We would then operate on that stream, so it makes sense to pass a stream into the operator. Are you suggesting we use a default stream across all of OMPI?

bosilca (Member): The user might configure some streams in OMPI, but not a stream per invocation of an MPI_Op. A stream per communicator would be a good addition, and we will figure out how to pass it down to operations not using communicators (such as the MPI_Op). But adding it as an explicit argument creates two MPI_Op APIs. I don't have a better idea right now; it is just that this approach requires too much code modification for very little added benefit.

Author (Contributor): The only alternative to explicit API pass-through I can think of is thread-local variables. That is hidden state: ugly and error-prone. In fact, we want to have both host-side and device-side incarnations of ops side by side because we don't know whether the user will pass us host or device buffers. So even if they had the same signature, we would want to store them separately. I'm not sure that it would simplify anything in a meaningful way.

bosilca (Member): Yeah, I thought about thread-local, but as you said it is error-prone and unsafe. I was more inclined toward a context-level storage solution, such as a communicator or maybe the collective itself, but something higher-level than the MPI_Op. The reason is that in the end we will want to be able to orchestrate (and take advantage of) the dependencies between different parts of the same collective, and this is more natural if they share a stream.

Author (Contributor): The question of how the stream ends up in MPI is an interesting one (and I am favoring communicators as well). Somehow it needs to come from the high level down to the operator, and I still favor the direct way of passing it as an argument. I just realized that when adding the opm_stream* members I should probably bump the version of the struct?

bosilca (Member): I don't think you need to bump the version of the module/component struct as the other function pointer has the same signature.
/* 2-buffer variants */
if (NULL != avail->ao_module->opm_stream_fns[i]) {
if (NULL != op->o_device_op->do_intrinsic.modules[i]) {
OBJ_RELEASE(op->o_device_op->do_intrinsic.modules[i]);
}
op->o_device_op->do_intrinsic.fns[i] = avail->ao_module->opm_stream_fns[i];
op->o_device_op->do_intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
}

/* 3-buffer variants */
if (NULL != avail->ao_module->opm_3buff_stream_fns[i]) {
if (NULL != op->o_device_op->do_3buff_intrinsic.modules[i]) {
OBJ_RELEASE(op->o_device_op->do_3buff_intrinsic.modules[i]);
}
op->o_device_op->do_3buff_intrinsic.fns[i] =
avail->ao_module->opm_3buff_stream_fns[i];
op->o_device_op->do_3buff_intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
}
}
} else {
for (i = 0; i < OMPI_OP_BASE_TYPE_MAX; ++i) {
/* 2-buffer variants */
if (NULL != avail->ao_module->opm_fns[i]) {
OBJ_RELEASE(op->o_func.intrinsic.modules[i]);
op->o_func.intrinsic.fns[i] = avail->ao_module->opm_fns[i];
op->o_func.intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
}

/* 3-buffer variants */
if (NULL != avail->ao_module->opm_3buff_fns[i]) {
OBJ_RELEASE(op->o_3buff_intrinsic.modules[i]);
op->o_3buff_intrinsic.fns[i] =
avail->ao_module->opm_3buff_fns[i];
op->o_3buff_intrinsic.modules[i] = avail->ao_module;
OBJ_RETAIN(avail->ao_module);
}
}
}
