diff --git a/.github/workflows/compile-rocm.yaml b/.github/workflows/compile-rocm.yaml index 7c98e1a5916..cf4ad932032 100644 --- a/.github/workflows/compile-rocm.yaml +++ b/.github/workflows/compile-rocm.yaml @@ -17,7 +17,7 @@ jobs: curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg echo 'deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/debian focal main' | sudo tee /etc/apt/sources.list.d/rocm.list sudo apt-get update - sudo apt-get install -y rocm-hip-sdk + sudo apt-get install -y rocm-hip-runtime - uses: actions/checkout@v3 with: submodules: recursive @@ -26,3 +26,9 @@ jobs: ./autogen.pl ./configure --prefix=${PWD}/install --with-rocm=/opt/rocm --disable-mpi-fortran make -j + - name: Clean up + run: | + ls -la ./ + rm -rf ./* + rm -rf ./.??* + ls -la ./ \ No newline at end of file diff --git a/.gitignore b/.gitignore index d15a1bc8f88..c1bfe01444a 100644 --- a/.gitignore +++ b/.gitignore @@ -534,3 +534,12 @@ docs/_templates # Common Python virtual environment directory names venv py?? + +# Copies of PRRTE RST files (i.e., not source controlled in this tree) +docs/prrte-rst-content +docs/schizo-ompi-rst-content + +# Copies of the built HTML docs and man pages (for distribution +# tarballs) +docs/html +docs/man diff --git a/.mailmap b/.mailmap index e8516075720..b463497a038 100644 --- a/.mailmap +++ b/.mailmap @@ -32,6 +32,7 @@ Jeff Squyres Jeff Squyres --quiet <--quiet> Jeff Squyres +Jeff Squyres George Bosilca diff --git a/.readthedocs-pre-create-environment.sh b/.readthedocs-pre-create-environment.sh new file mode 100755 index 00000000000..2709b822b80 --- /dev/null +++ b/.readthedocs-pre-create-environment.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -euxo pipefail + +# The ReadTheDocs build process does not run autogen/configure/make. +# Hence, we have to copy the PRRTE RST files (from the 3rd-party/prrte +# tree) to our docs/ tree manually. 
+ +# Ensure that we're in the RTD CI environment + +if [[ "${READTHEDOCS:-no}" == "no" ]]; then + echo "This script is only intended to be run in the ReadTheDocs CI environment" + exit 1 +fi + +SCHIZO_SRC_DIR=3rd-party/prrte/src/mca/schizo/ompi +SCHIZO_TARGET_DIR=docs/schizo-ompi-rst-content + +PRRTE_RST_SRC_DIR=3rd-party/prrte/src/docs/prrte-rst-content +PRRTE_RST_TARGET_DIR=docs/prrte-rst-content + +# Copy the OMPI schizo file from PRRTE + +cp -rp $SCHIZO_SRC_DIR $SCHIZO_TARGET_DIR + +# Only copy the PRRTE RST source files in prrte-rst-content that are +# referenced by ".. include::" in the schizo-ompi-cli.rst file. We do +# this because Sphinx complains if there are .rst files that are not +# referenced. :-( + +mkdir -p $PRRTE_RST_TARGET_DIR +files=`fgrep '.. include::' $SCHIZO_TARGET_DIR/schizo-ompi-cli.rstxt | awk '{ print $3 }'` +for file in $files; do + filename=`basename $file` + cp -pf $PRRTE_RST_SRC_DIR/$filename $PRRTE_RST_TARGET_DIR +done diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 44e0bbac5a7..2ba1fc07842 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -12,6 +12,11 @@ build: os: ubuntu-22.04 tools: python: "3.10" + jobs: + # RTD doesn't run configure or make. So we have to manually copy + # in the PRRTE RST files to docs/. + pre_create_environment: + - ./.readthedocs-pre-create-environment.sh python: install: @@ -21,3 +26,6 @@ python: sphinx: configuration: docs/conf.py fail_on_warning: true + +submodules: + include: all diff --git a/3rd-party/prrte b/3rd-party/prrte index 0347baa1eda..9015ca02cce 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 0347baa1edaec29c4f0cf1eac7b674ad7ba139c1 +Subproject commit 9015ca02cce72acc03f86d399f939843c42b3dc8 diff --git a/Makefile.ompi-rules b/Makefile.ompi-rules index 567bcfd99f3..d18d49c4978 100644 --- a/Makefile.ompi-rules +++ b/Makefile.ompi-rules @@ -2,6 +2,7 @@ # Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved. 
# Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2020 Intel, Inc. All rights reserved. +# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -26,6 +27,14 @@ OMPI_V_GEN = $(ompi__v_GEN_$V) ompi__v_GEN_ = $(ompi__v_GEN_$AM_DEFAULT_VERBOSITY) ompi__v_GEN_0 = @echo " GENERATE" $@; +OMPI_V_COPYALL = $(ompi__v_COPYALL_$V) +ompi__v_COPYALL_ = $(ompi__v_COPYALL_$AM_DEFAULT_VERBOSITY) +ompi__v_COPYALL_0 = @echo " COPY tree $@"; + +OMPI_V_SPHINX_COPYRST = $(ompi__v_SPHINX_COPYRST_$V) +ompi__v_SPHINX_COPYRST_ = $(ompi__v_SPHINX_COPYRST_$AM_DEFAULT_VERBOSITY) +ompi__v_SPHINX_COPYRST_0 = @echo " COPY RST source files"; + OMPI_V_SPHINX_HTML = $(ompi__v_SPHINX_HTML_$V) ompi__v_SPHINX_HTML_ = $(ompi__v_SPHINX_HTML_$AM_DEFAULT_VERBOSITY) ompi__v_SPHINX_HTML_0 = @echo " GENERATE HTML docs"; diff --git a/autogen.pl b/autogen.pl index 5af4704f2a1..3cb79025dbf 100755 --- a/autogen.pl +++ b/autogen.pl @@ -1643,6 +1643,10 @@ sub replace_config_sub_guess { if (! 
-f "3rd-party/prrte/configure.ac") { my_die("Could not find pmix files\n"); } + + verbose "Patching prrte.spec file\n"; + system("$patch_prog -N -p0 < ./config/prrte.spec.diff > /dev/null 2>&1"); + push(@subdirs, "3rd-party/prrte/"); $m4 .= "m4_define([package_prrte], [1])\n"; diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index 75aeb93e26e..01e39aaf968 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -107,7 +107,9 @@ AC_DEFUN([OMPI_CHECK_UCX],[ UCP_ATOMIC_FETCH_OP_FXOR, UCP_PARAM_FIELD_ESTIMATED_NUM_PPN, UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK, - UCP_OP_ATTR_FLAG_MULTI_SEND], + UCP_OP_ATTR_FLAG_MULTI_SEND, + UCS_MEMORY_TYPE_RDMA, + UCP_MEM_MAP_SYMMETRIC_RKEY], [], [], [#include ]) AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS], @@ -123,7 +125,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[ [#include ]) AC_CHECK_DECLS([ucp_tag_send_nbx, ucp_tag_send_sync_nbx, - ucp_tag_recv_nbx], + ucp_tag_recv_nbx, + ucp_rkey_compare], [], [], [#include ]) AC_CHECK_TYPES([ucp_request_param_t], diff --git a/config/ompi_setup_prrte.m4 b/config/ompi_setup_prrte.m4 index 4dffa6ceb2a..97eba7a1bd2 100644 --- a/config/ompi_setup_prrte.m4 +++ b/config/ompi_setup_prrte.m4 @@ -19,6 +19,7 @@ dnl Copyright (c) 2019-2020 Intel, Inc. All rights reserved. dnl Copyright (c) 2020-2022 Amazon.com, Inc. or its affiliates. All Rights reserved. dnl Copyright (c) 2021 Nanook Consulting. All rights reserved. dnl Copyright (c) 2021-2022 IBM Corporation. All rights reserved. +dnl Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -35,10 +36,25 @@ dnl dnl A Makefile conditional OMPI_WANT_PRRTE will be defined based on the dnl results of the build. 
AC_DEFUN([OMPI_SETUP_PRRTE],[ - OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy]) + AC_REQUIRE([AC_PROG_LN_S]) + +OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy target_rst_dir]) opal_show_subtitle "Configuring PRRTE" + # We *must* have setup Sphinx before invoking this macro (i.e., it + # is a programming error -- not a run-time error -- if Sphinx was + # not previously setup). + OAC_ASSERT_BEFORE([OAC_SETUP_SPHINX], [OMPI_SETUP_PRRTE]) + + # These are sym links to folders with PRRTE's RST files that we'll + # slurp into mpirun.1.rst. We'll remove these links (or even + # accidental full copies) now and replace them with new links to + # the PRRTE that we find, below. + target_rst_dir="$OMPI_TOP_BUILDDIR/docs" + rm -rf "$target_rst_dir/prrte-rst-content" + rm -rf "$target_rst_dir/schizo-ompi-rst-content" + OPAL_3RDPARTY_WITH([prrte], [prrte], [package_prrte], [1]) AC_ARG_WITH([prrte-bindir], @@ -101,12 +117,15 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ [$OMPI_USING_INTERNAL_PRRTE], [Whether or not we are using the internal PRRTE]) - OPAL_SUMMARY_ADD([Miscellaneous], [prrte], [], [$opal_prrte_mode]) + AC_SUBST(OMPI_PRRTE_RST_CONTENT_DIR) + AC_SUBST(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR) + AM_CONDITIONAL(OMPI_HAVE_PRRTE_RST, [test $OMPI_HAVE_PRRTE_RST -eq 1]) + + OPAL_SUMMARY_ADD([Miscellaneous], [PRRTE], [], [$opal_prrte_mode]) OPAL_VAR_SCOPE_POP ]) - dnl _OMPI_SETUP_PRRTE_INTERNAL([action-if-success], [action-if-not-success]) dnl dnl Attempt to configure the built-in PRRTE. @@ -220,7 +239,15 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_INTERNAL], [ [AC_MSG_ERROR([PRRTE configuration failed. 
Cannot continue.])]) AS_IF([test "$internal_prrte_happy" = "yes"], - [$1], [$2]) + [AC_MSG_CHECKING([for internal PRRTE RST files]) + AS_IF([test -n "$SPHINX_BUILD"], + [OMPI_HAVE_PRRTE_RST=1 + OMPI_PRRTE_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/docs/prrte-rst-content" + OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/mca/schizo/ompi" + AC_MSG_RESULT([found])], + [AC_MSG_RESULT([not found])]) + $1], + [$2]) OPAL_VAR_SCOPE_POP ]) @@ -284,9 +311,27 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [ [AC_DEFINE_UNQUOTED([OMPI_PRTERUN_PATH], ["${prterun_path}"], [Path to prterun])]) AS_IF([test "$setup_prrte_external_happy" = "yes"], - [$1], [$2]) + [ # Determine if this external PRRTE has installed the RST + # directories that we care about + + AC_MSG_CHECKING([for external PRRTE RST files]) + prrte_install_dir=${with_prrte}/share/prte/rst + AS_IF([test -n "$SPHINX_BUILD"], + [AS_IF([test -d "$prrte_install_dir/prrte-rst-content" && \ + test -d "$prrte_install_dir/schizo-ompi-rst-content"], + [OMPI_HAVE_PRRTE_RST=1 + OMPI_PRRTE_RST_CONTENT_DIR="$prrte_install_dir/prrte-rst-content" + OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$prrte_install_dir/schizo-ompi-rst-content" + AC_MSG_RESULT([found]) + ], + [ # This version of PRRTE doesn't have installed RST + # files. 
+ AC_MSG_RESULT([not found]) + OMPI_HAVE_PRRTE_RST=0 + ]) + ]) + $1], + [$2]) OPAL_VAR_SCOPE_POP ]) - - diff --git a/config/prrte.spec.diff b/config/prrte.spec.diff new file mode 100644 index 00000000000..4e8b1a86eb1 --- /dev/null +++ b/config/prrte.spec.diff @@ -0,0 +1,20 @@ +--- 3rd-party/prrte/contrib/dist/linux/prrte.spec 2023-10-03 08:12:43.842625000 -0400 ++++ 3rd-party/prrte/contrib/dist/linux/prrte.spec 2023-10-03 08:12:27.849686000 -0400 +@@ -612,7 +612,7 @@ + %{shell_scripts_path}/%{shell_scripts_basename}.sh + %{shell_scripts_path}/%{shell_scripts_basename}.csh + %endif +-%doc README INSTALL LICENSE ++%doc README.md LICENSE + + %else + +@@ -656,7 +656,7 @@ + %{shell_scripts_path}/%{shell_scripts_basename}.sh + %{shell_scripts_path}/%{shell_scripts_basename}.csh + %endif +-%doc README INSTALL LICENSE ++%doc README.md LICENSE + %{_pkgdatadir} + + %files devel -f devel.files diff --git a/configure.ac b/configure.ac index 7c3c3936c3b..f03bdaf268c 100644 --- a/configure.ac +++ b/configure.ac @@ -28,6 +28,7 @@ # Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. # Copyright (c) 2019 Triad National Security, LLC. All rights # reserved. +# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -1072,7 +1073,7 @@ AS_IF([test -z "$LEX" || \ dnl Note that we have to double escape the URL below dnl so that the # it contains doesn't confuse the Autotools -OAC_SETUP_SPHINX([$srcdir/docs/_build/man/MPI_T.3], +OAC_SETUP_SPHINX([$srcdir/docs/man/MPI_T.3], [[https://docs.open-mpi.org/en/main/developers/prerequisites.html#sphinx-and-therefore-python]]) # diff --git a/contrib/ompi_cplusplus.txt b/contrib/ompi_cplusplus.txt index a61994b0e69..35f2c95e36a 100644 --- a/contrib/ompi_cplusplus.txt +++ b/contrib/ompi_cplusplus.txt @@ -132,7 +132,6 @@ ./opal/util/few.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/keyval_parse.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/malloc.h: defined(c_plusplus) defined(__cplusplus) -./opal/util/opal_pty.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/os_path.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/qsort.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/show_help_lex.h: defined(c_plusplus) defined(__cplusplus) diff --git a/docs/Makefile.am b/docs/Makefile.am index 3aa2b3b960f..eacf2baf9b8 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -1,6 +1,7 @@ # # Copyright (c) 2022 Cisco Systems, Inc. All rights reserved. # +# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -26,7 +27,7 @@ .NOTPARALLEL: OUTDIR = _build -SPHINX_CONFIG = conf.py +SPHINX_CONFIG = $(srcdir)/conf.py SPHINX_OPTS ?= -W --keep-going -j auto # Note: it is significantly more convenient to list all the source @@ -58,6 +59,9 @@ RST_SOURCE_FILES = \ EXTRA_DIST = \ requirements.txt \ + no-prrte-content.rst.txt \ + html \ + man \ $(SPHINX_CONFIG) \ $(TEXT_SOURCE_FILES) \ $(IMAGE_SOURCE_FILES) \ @@ -784,27 +788,48 @@ OSHMEM_MAN3 = \ MAN_OUTDIR = $(OUTDIR)/man +# If we're building the docs, then we install from the just-built +# docs. 
Otherwise, we install from the pre-built docs (i.e., the docs +# included in the tarball). +# +# NOTE: If we're in a git clone with a) no pre-built docs and b) +# Sphinx is not found, then both OPAL_BUILD_DOCS and OPAL_INSTALL_DOCS +# will be false, and the value of MAN_INSTALL_FROM will not be used. +if OPAL_BUILD_DOCS +MAN_INSTALL_FROM = $(MAN_OUTDIR) +HTML_INSTALL_FROM = $(OUTDIR)/html +else +MAN_INSTALL_FROM = man +HTML_INSTALL_FROM = html +endif + +# For each of the man page macros below: +# +# *_RST: the .rst source files +# *_BUILT: the files in the _build/man directory +# *_INSTALL_FROM: the files in either the _build/man/ directory (if we +# are building the Sphinx docs) or the man/ directory (if we are not +# building the Sphinx docs, and are using the pre-built docs that +# are included in the tarball). OMPI_MAN1_RST = $(OMPI_MAN1:%.1=man-openmpi/man1/%.1.rst) OMPI_MAN1_BUILT = $(OMPI_MAN1:%.1=$(MAN_OUTDIR)/%.1) +OMPI_MAN1_INSTALL_FROM = $(OMPI_MAN1:%.1=$(MAN_INSTALL_FROM)/%.1) OMPI_MAN3_RST = $(OMPI_MAN3:%.3=man-openmpi/man3/%.3.rst) OMPI_MAN3_BUILT = $(OMPI_MAN3:%.3=$(MAN_OUTDIR)/%.3) +OMPI_MAN3_INSTALL_FROM = $(OMPI_MAN3:%.3=$(MAN_INSTALL_FROM)/%.3) OMPI_MAN7_RST = $(OMPI_MAN7:%.7=man-openmpi/man7/%.7.rst) OMPI_MAN7_BUILT = $(OMPI_MAN7:%.7=$(MAN_OUTDIR)/%.7) +OMPI_MAN7_INSTALL_FROM = $(OMPI_MAN7:%.7=$(MAN_INSTALL_FROM)/%.7) OSHMEM_MAN1_RST = $(OSHMEM_MAN1:%.1=man-oshmem/man1/%.1.rst) OSHMEM_MAN1_BUILT = $(OSHMEM_MAN1:%.1=$(MAN_OUTDIR)/%.1) +OSHMEM_MAN1_INSTALL_FROM = $(OSHMEM_MAN1:%.1=$(MAN_INSTALL_FROM)/%.1) OSHMEM_MAN3_RST = $(OSHMEM_MAN3:%.3=man-oshmem/man3/%.3.rst) OSHMEM_MAN3_BUILT = $(OSHMEM_MAN3:%.3=$(MAN_OUTDIR)/%.3) - -EXTRA_DIST += \ - $(OMPI_MAN1_BUILT) \ - $(OMPI_MAN3_BUILT) \ - $(OMPI_MAN7_BUILT) \ - $(OSHMEM_MAN1_BUILT) \ - $(OSHMEM_MAN3_BUILT) +OSHMEM_MAN3_INSTALL_FROM = $(OSHMEM_MAN3:%.3=$(MAN_INSTALL_FROM)/%.3) ########################################################################### @@ -845,49 +870,201 @@ EXTRA_DIST += \
$(OSHMEM_MAN1_CXX_REDIRECTS) \ $(OSHMEM_MAN1_FORTRAN_REDIRECTS) + +########################################################################### + +ALL_MAN_BUILT = \ + $(OMPI_MAN1_BUILT) $(OMPI_MAN3_BUILT) $(OMPI_MAN7_BUILT) \ + $(OSHMEM_MAN1_BUILT) $(OSHMEM_MAN3_BUILT) + +# These 2 targets are used in EXTRA_DIST: we make a full copy of the +# built HTML and man docs into a separate location that is included in +# the tarball. This gives users a full copy of the docs included in +# distribution tarballs. +html: $(ALL_MAN_BUILT) + $(OMPI_V_COPYALL) rm -rf html; cp -rp $(OUTDIR)/html . + +man: $(ALL_MAN_BUILT) + $(OMPI_V_COPYALL) rm -rf man; cp -rp $(OUTDIR)/man . + +# Remove the copies of the built HTML and man pages to get back to a +# clean git clone. +maintainer-clean-local: + rm -rf html man + +# If we're doing a VPATH build, we may have "html" and "man" +# directories in the build tree (e.g., if we did "make dist"). Remove +# these copies so that we can pass distcheck (of course: we never +# remove these directories from the source tree). +distclean-local: + if test "$(srcdir)" != "$(builddir)"; then \ + rm -rf html man; \ + fi + ########################################################################### if OPAL_BUILD_DOCS include $(top_srcdir)/Makefile.ompi-rules -# Have to not list these targets in EXTRA_DIST outside of the -# OPAL_BUILD_DOCS conditional because "make dist" will fail due to -# these missing targets (and therefore not run the "dist-hook" target -# in the top-level Makefile, which prints a pretty message about why -# "make dist" failed). +# Copy over the PRRTE RST files to this build tree. # -# We list the entire directory trees (html and man) to grab all -# generated files in them. -EXTRA_DIST += \ - $(OUTDIR)/html \ - $(OUTDIR)/man +# 1. If we're building with PRRTE support: +# +# 1a. If we're building the internal/bundled PRRTE, then we'll copy +# the internal/bundled PRRTE's RST files to the build tree. +# 1b.
If we're building against an external PRRTE installation that +# has RST files in its install tree, then we'll copy that +# external PRRTE's RST files to the build tree. +# 1c. If we're building against an external PRRTE installation that +# does NOT have RST files in its install tree, then we'll +# create some dummy RST files instead. +# +# 2. If we're building without PRRTE support, we'll create some dummy +# RST files instead. +# +# NOTE: We specifically list $(builddir) in the target name, just to +# ensure that "make" doesn't accidentally find this directory in the +# VPATH srcdir, and therefore not execute this rule (because Sphinx +# does not understand VPATH, and will ignore this directory in the +# VPATH srcdir). We can have this directory in the srcdir by doing a +# VPATH build of an official distribution tarball. -ALL_MAN_BUILT = \ - $(OMPI_MAN1_BUILT) $(OMPI_MAN3_BUILT) $(OMPI_MAN7_BUILT) \ - $(OSHMEM_MAN1_BUILT) $(OSHMEM_MAN_3_BUILT) +# Make the 2 directories that we need: schizo-ompi-rst-content and +# prrte-rst-content. +$(builddir)/schizo-ompi-rst-content: + $(OMPI_V_MKDIR) if test ! -d "$@"; then mkdir "$@"; fi +$(builddir)/prrte-rst-content: + $(OMPI_V_MKDIR) if test ! -d "$@"; then mkdir "$@"; fi + +# Get the schizo-ompi-rst-cli.rst file that we need. CAVEAT: we name +# it ".in" so that Sphinx doesn't slurp it in via two different +# locations in the RST docroot (i.e., via +# /schizo-ompi-rst-content/schizo-ompi-cli.rstxt and via +# /man-openmpi/man1/mpirun.1.rst). Sphinx *shouldn't* do this -- it +# should see the ".. include...." directive in mpirun.1.rst and *only* +# include the file once. But somehow it's also seeing it a 2nd time. +# So -- fine. We'll name it something other than .rst so that Sphinx +# doesn't do that. +# +# Regardless, either copy this file from the PRRTE install tree or +# make a bogus one (if we don't have one in the PRRTE install tree). 
+# +# Also, note: the rule to make the $(builddir)/schizo-ompi-rst-content +# directory must be in the AM_CONDITIONAL here, otherwise Automake +# complains. Meaning: we have to have same dependency listed in both +# the "if" and the "else" blocks. Grumble. +if OMPI_HAVE_PRRTE_RST +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(builddir)/schizo-ompi-rst-content +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR)/* + $(OMPI_V_SPHINX_COPYRST) \ + dir=`dirname $@`; \ + cp -rpf $(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR)/* "$$dir" +else +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(builddir)/schizo-ompi-rst-content +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(srcdir)/no-prrte-content.rst.txt + dir=`dirname $@`; if test ! -d "$$dir"; then mkdir "$$dir"; fi + $(OMPI_V_SPHINX_COPYRST) \ + cp -pf $(srcdir)/no-prrte-content.rst.txt "$@" +endif + +$(ALL_MAN_BUILT): $(builddir)/prrte-rst-content +$(ALL_MAN_BUILT): $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt $(ALL_MAN_BUILT): $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(ALL_MAN_BUILT): $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG) +# Render the RST source into both 1) full HTML docs and 2) nroff man +# pages. +# # List both commands (HTML and man) in a single rule because they # really need to be run in serial. Specifically, if they were two # different rules and someone ran "make -j", then both of them could # be writing to $(OUTDIR)/doctrees simultaneously, which would be Bad. # Use one of the man pages as a sentinel file to indicate whether all # the HTML docs and man pages have been built. +# +# It's therefore a little bit of a lie to have the target named +# $(ALL_MAN_BUILT) *also* generate all the HTML content, but... so be +# it. 
+# +# Also note that Open MPI's RST includes some conditional RST (from +# PRRTE -- i.e., whether we get the source RST from the internal +# PRRTE, an external PRRTE, or whether we create RST files from +# scratch). These conditionals mean that we have to make some changes +# to the input Sphinx RST tree before building it. But -- by Automake +# convention -- we can't modify the source tree. Hence, we have to +# copy over all the source RST files -- including its internal +# directory structure -- to the build tree, and then make our desired +# changes here in the build tree. This is a bit ugly, but we could +# not think of anything better to do. +# +# NOTE: This is a little gross in that for a VPATH build, we *always* +# copy from the source tree to the dest tree (if the target does not +# exist or any of the sources in the source tree -- thanks to +# make/VPATH handling -- have changed compared to the target). +# However, we're using "cp -p", so even though we're copying *all the +# sources* from the source tree to the build tree, the timestamp will +# reflect what is in the source tree. Hence, if the source file has +# not changed, then it won't look like the file in the build tree has +# changed. We're going to overwrite any local changes in the build +# tree, but you shouldn't be editing the build tree, anyway. So -- +# good enough. +# +# Finally, one added wrinkle: only copy the RST source files in +# prrte-rst-content that are referenced by ".. include::" in the +# schizo-ompi-cli.rstxt file. We do this because Sphinx complains if +# there are .rst files that are not referenced. 
:-( $(ALL_MAN_BUILT): - $(OMPI_V_SPHINX_HTML) $(SPHINX_BUILD) -M html "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) - $(OMPI_V_SPHINX_MAN) $(SPHINX_BUILD) -M man "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_COPYRST) if test "$(srcdir)" != "$(builddir)"; then \ + len=`echo "$(srcdir)/" | wc -c`; \ + for file in $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG); do \ + dir=`dirname $$file | cut -c$$len-`; \ + if test -z "$$dir"; then \ + dir=.; \ + fi; \ + if test ! -d "$$dir"; then \ + mkdir -p "$$dir"; \ + fi; \ + cp -p "$$file" "$$dir"; \ + done; \ + fi; \ + for file in `fgrep '.. include::' $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt | awk '{ print $$3 }'`; do \ + filename=`basename $$file`; \ + cp -pf $(OMPI_PRRTE_RST_CONTENT_DIR)/$$filename "$(builddir)/prrte-rst-content"; \ + done + $(OMPI_V_SPHINX_HTML) OMPI_VERSION_FILE=$(top_srcdir)/VERSION $(SPHINX_BUILD) -M html "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_MAN) OMPI_VERSION_FILE=$(top_srcdir)/VERSION $(SPHINX_BUILD) -M man "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) # A useful rule to invoke manually to ensure that all of the external # HTML links we have are valid. Running this rule requires # connectivity to the general internet. linkcheck: - $(SPHINX_BUILD) -M linkcheck "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(SPHINX_BUILD) -M linkcheck "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) .PHONY: linkcheck -maintainer-clean-local: - $(SPHINX_BUILD) -M clean "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) +# Since we are building the docs, we built $(OUTDIR). Hence, we need +# to delete it during "make clean". Note that we can't add +# directories to CLEANFILES, because Automake only (effectively) does +# "rm -f $(CLEANFILES)" (not "rm -rf ..."). So we have to delete +# directories ourselves. +# +# Also, if this is a VPATH build, then we made a copy of a bunch of +# RST source files to the build tree. So delete all of those, too. 
+clean-local: + rm -rf $(OUTDIR) + rm -rf prrte-rst-content schizo-ompi-rst-content + if test "$(srcdir)" != "$(builddir)"; then \ + len=`echo "$(srcdir)/" | wc -c`; \ + for file in $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG); do \ + dir=`dirname $$file | cut -c$$len-`; \ + if test -z "$$dir"; then \ + rm -rf `basename $$file`; \ + fi; \ + if test -n "$$dir" && test -d "$$dir"; then \ + rm -rf "$$dir"; \ + fi; \ + done; \ + fi # List all the built man pages here in the Automake BUILT_SOURCES # macro. This hooks into the normal Automake build mechanisms, and @@ -901,7 +1078,7 @@ endif OPAL_BUILD_DOCS if OPAL_INSTALL_DOCS man1_MANS = \ - $(OMPI_MAN1_BUILT) \ + $(OMPI_MAN1_INSTALL_FROM) \ $(OMPI_MAN1_C_REDIRECTS) if OMPI_HAVE_CXX_COMPILER man1_MANS += $(OMPI_MAN1_CXX_REDIRECTS) @@ -913,12 +1090,12 @@ if OMPI_WANT_JAVA_BINDINGS man1_MANS += $(OMPI_MAN1_JAVA_REDIRECTS) endif -man3_MANS = $(OMPI_MAN3_BUILT) -man7_MANS = $(OMPI_MAN7_BUILT) +man3_MANS = $(OMPI_MAN3_INSTALL_FROM) +man7_MANS = $(OMPI_MAN7_INSTALL_FROM) if PROJECT_OSHMEM man1_MANS += \ - $(OSHMEM_MAN1_BUILT) \ + $(OSHMEM_MAN1_INSTALL_FROM) \ $(OSHMEM_MAN1_C_REDIRECTS) # There is no OSHMEM equivalent of this conditional; just use the OMPI # conditional. @@ -929,7 +1106,7 @@ if OSHMEM_BUILD_FORTRAN_BINDINGS man1_MANS += $(OSHMEM_MAN1_FORTRAN_REDIRECTS) endif -man3_MANS += $(OSHMEM_MAN3_BUILT) +man3_MANS += $(OSHMEM_MAN3_INSTALL_FROM) endif # We do not know the names of all the generated HTML files: we only @@ -945,19 +1122,29 @@ endif # Automake-provided install macros to set desirable permissions on the # target directories and files. # -# Since this might be a VPATH build, first check to see if _build/html -# exists in the source tree. If not, do the find+install from the -# build tree. +# Check to see if we actually built the docs. If we did, copy from +# the _build/html tree in the builddir. 
In all other cases, see if +# there's a _build/html in the source tree (e.g., if this is a build +# from a tarball that included a _build/html); if that exists, copy +# from that. +# +# NOTE: We can't use the AM_CONDITIONAL OPAL_BUILD_DOCS in the middle +# of a block that uses the shell continuation character at the end of +# each line. Instead, we check if $(SPHINX_BUILD) is non-empty, which +# is the test used to construct OPAL_BUILD_DOCS. install-data-hook: $(MKDIR_P) $(DESTDIR)$(docdir) - if test -d $(srcdir)/_build/html; then \ - topdir=$(srcdir)/_build; \ - else \ - topdir=_build; \ + topdir= ; \ + if test -n "$(SPHINX_BUILD)" && test -d $(builddir)/$(HTML_INSTALL_FROM); then \ + topdir="$(builddir)/$(HTML_INSTALL_FROM)"; \ + elif test -d $(srcdir)/$(HTML_INSTALL_FROM); then \ + topdir="$(srcdir)/$(HTML_INSTALL_FROM)"; \ fi; \ - cd $$topdir; \ - find html -type d -exec $(mkinstalldirs) $(DESTDIR)$(docdir)/{} \; ; \ - find html -type f -exec $(INSTALL_DATA) {} $(DESTDIR)$(docdir)/{} \; + if test -n "$$topdir"; then \ + cd $$topdir/..; \ + find html -type d -exec $(mkinstalldirs) $(DESTDIR)$(docdir)/{} \; ; \ + find html -type f -exec $(INSTALL_DATA) {} $(DESTDIR)$(docdir)/{} \; ; \ + fi uninstall-hook: rm -rf $(DESTDIR)$(docdir) diff --git a/docs/conf.py b/docs/conf.py index bf192f5356b..b8b7e8c4690 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -10,9 +10,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os # -- Project information ----------------------------------------------------- @@ -24,8 +22,20 @@ author = 'The Open MPI Community' # The full version, including alpha/beta/rc tags -# Read the Open MPI version from the VERSION file -with open("../VERSION") as fp: +# Read the Open MPI version from the VERSION file in the source tree +# The docs/Makefile.am will set the env var OMPI_VERSION_FILE, because +# we might be doing a VPATH build. +filename = None +if 'OMPI_VERSION_FILE' in os.environ: + filename = os.environ['OMPI_VERSION_FILE'] +elif os.path.exists("../VERSION"): + filename = '../VERSION' + +if filename is None: + print("ERROR: Could not find Open MPI source tree VERSION file") + exit(1) + +with open(filename) as fp: ompi_lines = fp.readlines() ompi_data = dict() diff --git a/docs/index.rst b/docs/index.rst index a1f7d0b6d2f..c339c213622 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -28,9 +28,13 @@ Documentation for Open MPI can be found in the following locations: * - v5.0.0 and later - Web: https://docs.open-mpi.org/ - Tarball: ``docs/_build/html/index.html`` + Included in tarball: ``docs/html/index.html`` - Installed: ``$prefix/share/doc/openmpi/html/index.html`` + Built in source tree (if Sphinx available): ``docs/_build/html/index.html`` + + Installed: ``$docdir/html/index.html`` + + (which defaults to: ``$prefix/share/doc/openmpi/html/index.html``) * - v4.1.x and earlier - See the `legacy Open MPI FAQ `_ diff --git a/docs/installing-open-mpi/packagers.rst b/docs/installing-open-mpi/packagers.rst index 6435abded08..e43d52b101a 100644 --- a/docs/installing-open-mpi/packagers.rst +++ b/docs/installing-open-mpi/packagers.rst @@ -1,3 +1,5 @@ +.. _label-install-packagers: + Advice for packagers ==================== @@ -20,9 +22,26 @@ the following: .. 
code-block:: sh + # Install Sphinx so that Open MPI can re-build its docs with the + # installed PRRTE's docs + + virtualenv venv + . ./venv/bin/activate + pip install -r docs/requirements.txt + ./configure --with-libevent=external --with-hwloc=external \ --with-pmix=external --with-prrte=external ... +.. important:: Note the installation of the Sphinx tool so that Open + MPI can re-build its documentation with the external + PRRTE's documentation. + + Failure to do this will mean Open MPI's documentation + will be correct for the version of PRRTE that is + bundled in the Open MPI distribution, but may not be + entirely correct for the version of PRRTE that you are + building against. + The ``external`` keywords will force Open MPI's ``configure`` to ignore all the bundled libraries and only look for external versions of these support libraries. This also has the benefit of causing @@ -36,6 +55,29 @@ independently-built and installed versions. information about the required support library ``--with-FOO`` command line options. +Have Sphinx installed +--------------------- + +Since you should be (will be) installing Open MPI against an external +PRRTE and PMIx, you should have `Sphinx +<https://www.sphinx-doc.org/>`_ installed before running Open MPI's +``configure`` script. + +This will allow Open MPI to (re-)build its documentation according to +the PMIx and PRRTE that you are building against. + +To be clear: the Open MPI distribution tarball comes with pre-built +documentation |mdash| rendered in HTML and nroff |mdash| that is +suitable for the versions of PRRTE and PMIx that are bundled in that +tarball. + +However, if you are building Open MPI against not-bundled versions of +PRRTE / PMIx (as all packagers should be), Open MPI needs to re-build +its documentation with specific information from those external PRRTE +/ PMIx installs. For that, you need to have Sphinx installed before +running Open MPI's ``configure`` script. + + ..
_label-install-packagers-dso-or-not: Components ("plugins"): DSO or no? diff --git a/docs/installing-open-mpi/required-support-libraries.rst b/docs/installing-open-mpi/required-support-libraries.rst index 9e02297998b..b411e1a02f5 100644 --- a/docs/installing-open-mpi/required-support-libraries.rst +++ b/docs/installing-open-mpi/required-support-libraries.rst @@ -399,6 +399,5 @@ Open MPI package should not include Hwloc, Libevent, PMIx, or PRRTE. Instead, it should depend on external, independently-built versions of these packages. -See the :ref:`Advice for packagers -` section for more -details. +See the :ref:`Advice for packagers <label-install-packagers>` section +for more details. diff --git a/docs/man-openmpi/man1/mpirun.1.rst b/docs/man-openmpi/man1/mpirun.1.rst index 66a0e75c269..c9168b60076 100644 --- a/docs/man-openmpi/man1/mpirun.1.rst +++ b/docs/man-openmpi/man1/mpirun.1.rst @@ -60,15 +60,17 @@ probably want to use a command line of the following form: This will run ``X`` copies of ```` in your current run-time environment (if running under a supported resource manager, Open MPI's -mpirun will usually automatically use the corresponding resource -manager process starter, as opposed to, for example, ``rsh`` or ``ssh``, which -require the use of a hostfile, or will default to running all ``X`` copies -on the localhost), scheduling (by default) in a round-robin fashion by -CPU slot. See the rest of this page for more details. - -Please note that mpirun automatically binds processes as of the start -of the v1.8 series. Three binding patterns are used in the absence of -any further directives (See :ref:`map/rank/bind defaults ` for more details): +``mpirun`` will usually automatically use the corresponding resource +manager process starter, as opposed to ``ssh`` (for example), which +requires the use of a hostfile, or will default to running all ``X`` +copies on the localhost), scheduling (by default) in a round-robin +fashion by CPU slot.
See the rest of this documentation for more +details. + +Please note that ``mpirun`` automatically binds processes to hardware +resources. Three binding patterns are used in the absence of any +further directives (See :ref:`map/rank/bind defaults +` for more details): * **Bind to core**: when the number of processes is <= 2 * **Bind to package**: when the number of processes is > 2 @@ -79,103 +81,43 @@ that you are either not bound at all (by specifying ``--bind-to none``), or bound to multiple cores using an appropriate binding level or specific number of processing elements per application process. -.. _man1-mpirun-definition-of-slot: - -DEFINITION OF 'SLOT' --------------------- - -The term "slot" is used extensively in the rest of this manual page. -A slot is an allocation unit for a process. The number of slots on a -node indicate how many processes can potentially execute on that node. -By default, Open MPI will allow one process per slot. - -If Open MPI is not explicitly told how many slots are available on a -node (e.g., if a hostfile is used and the number of slots is not -specified for a given node), it will determine a maximum number of -slots for that node in one of two ways: - -#. Default behavior: By default, Open MPI will attempt to discover the - number of processor cores on the node, and use that as the number - of slots available. - -#. When ``--use-hwthread-cpus`` is used: If ``--use-hwthread-cpus`` is - specified on the ``mpirun`` command line, then Open MPI will attempt to - discover the number of hardware threads on the node, and use that - as the number of slots available. - -This default behavior also occurs when specifying the ``--host`` -option with a single host. Thus, the command: - -.. code:: sh - - shell$ mpirun --host node1 ./a.out - -launches a number of processes equal to the number of cores on node -``node1``, whereas: - -.. 
code:: sh - - shell$ mpirun --host node1 --use-hwthread-cpus ./a.out - -launches a number of processes equal to the number of hardware -threads on ``node1``. - -When Open MPI applications are invoked in an environment managed by a -resource manager (e.g., inside of a Slurm job), and Open MPI was built -with appropriate support for that resource manager, then Open MPI will -be informed of the number of slots for each node by the resource -manager. For example: - -.. code:: sh - - shell$ mpirun ./a.out - -launches one process for every slot (on every node) as dictated by -the resource manager job specification. - -Also note that the one-process-per-slot restriction can be overridden -in unmanaged environments (e.g., when using hostfiles without a -resource manager) if oversubscription is enabled (by default, it is -disabled). Most MPI applications and HPC environments do not -oversubscribe; for simplicity, the majority of this documentation -assumes that oversubscription is not enabled. - -Slots are not hardware resources -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +OPEN MPI'S USE OF PRRTE +----------------------- -Slots are frequently incorrectly conflated with hardware resources. -It is important to realize that slots are an entirely different metric -than the number (and type) of hardware resources available. +Open MPI uses the PMIx Reference Runtime Environment (PRRTE) as the +main engine for launching, monitoring, and terminating MPI processes. -Here are some examples that may help illustrate the difference: +Much of the documentation below is directly imported from PRRTE. As +such, it frequently refers to PRRTE concepts and command line options. +Except where noted, these concepts and command line arguments are all +applicable to Open MPI as well. Open MPI extends the available PRRTE +command line options, and also slightly modifies PRRTE's default +behaviors in a few cases. These will be specifically described in the +documentation below. -#.
More processor cores than slots: Consider a resource manager job - environment that tells Open MPI that there is a single node with 20 - processor cores and 2 slots available. By default, Open MPI will - only let you run up to 2 processes. - - Meaning: you run out of slots long before you run out of processor - cores. +COMMAND LINE OPTIONS +-------------------- -#. More slots than processor cores: Consider a hostfile with a single - node listed with a ``slots=50`` qualification. The node has 20 - processor cores. By default, Open MPI will let you run up to 50 - processes. +The core of Open MPI's ``mpirun`` processing is performed via the +`PRRTE `_. Specifically: ``mpirun`` is +effectively a wrapper around ``prterun``, but ``mpirun``'s CLI options +are slightly different than PRRTE's CLI commands. - Meaning: you can run many more processes than you have processor - cores. +.. include:: /schizo-ompi-rst-content/schizo-ompi-cli.rstxt -.. _man1-mpirun-definition-of-processor-element: +OPTIONS (OLD / HARD-CODED CONTENT -- TO BE AUDITED) +--------------------------------------------------- -DEFINITION OF 'PROCESSOR ELEMENT' ---------------------------------- +.. admonition:: This is old content + :class: error -By default, Open MPI defines that a "processing element" is a -processor core. However, if ``--use-hwthread-cpus`` is specified on the -mpirun command line, then a "processing element" is a hardware thread. + This is the old section of manually hard-coded content. It should + probably be read / audited to see what we want to keep and what we + want to discard. -OPTIONS -------- + Feel free to refer to https://docs.prrte.org/ rather than + replicating content here (e.g., for the definition of a slot and + other things). mpirun will send the name of the directory where it was invoked on the local node to each of the remote nodes, and attempt to change to that @@ -251,10 +193,11 @@ processes will be bound to the package. context.
If no value is provided for the number of copies to execute (i.e., neither the ``-n`` nor its synonyms are provided on the command line), Open MPI will automatically execute a copy of the - program on each process slot (see :ref:`defintion of slot ` for description of a - "process slot"). This feature, however, can only be used in the SPMD - model and will return an error (without beginning execution of the - application) otherwise. + program on each process slot (see PRRTE's `definition of "slot" + `_ + for description of a "process slot"). This feature, however, can + only be used in the SPMD model and will return an error (without + beginning execution of the application) otherwise. .. note:: The ``-n`` option is the preferred option to be used to specify the number of copies of the program to be executed, but the alternate @@ -280,7 +223,7 @@ To map processes: * ``--map-by ``: Map to the specified object, defaults to ``package``. Supported options include ``slot``, ``hwthread``, ``core``, ``L1cache``, ``L2cache``, ``L3cache``, ``package``, ``numa``, - ``node``, ``seq``, ``rankfile``, ``pe-list=#``, and ``ppr``. + ``node``, ``seq``, ``rankfile``, ``pe-list=#``, and ``ppr``. Any object can include modifiers by adding a ``:`` and any combination of the following: @@ -561,13 +504,17 @@ There are also other options: Note that if a number of slots is not provided to Open MPI (e.g., via the ``slots`` keyword in a hostfile or from a resource manager such as Slurm), the use of this option changes the default - calculation of number of slots on a node. See the :ref:`DEFINITION - OF 'SLOT' ` section. + calculation of number of slots on a node. See PRRTE's + `definition of "slot" + `_ + for more details. Also note that the use of this option changes the Open MPI's definition of a "processor element" from a processor core to a - hardware thread. See the :ref:`DEFINITION OF 'PROCESSOR ELEMENT' - ` section. + hardware thread.
See + PRRTE's `definition of a "processor element" + `_ + for more details. The following options are useful for developers; they are not generally useful to most Open MPI users: @@ -601,11 +548,23 @@ There may be other options listed with ``mpirun --help``. Environment Variables ^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + * ``MPIEXEC_TIMEOUT``: Synonym for the ``--timeout`` command line option. DESCRIPTION ----------- +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + One invocation of ``mpirun`` starts an MPI application running under Open MPI. If the application is single process multiple data (SPMD), the application can be specified on the ``mpirun`` command line. @@ -630,6 +589,12 @@ while others are specific to a single program (e.g., ``-n``). Specifying Host Nodes ^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Host nodes can be identified on the ``mpirun`` command line with the ``--host`` option or in a hostfile. @@ -679,6 +644,12 @@ from the resource manager. Specifying Number of Processes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + As we have just seen, the number of processes to run can be set using the hostfile. Other mechanisms exist. @@ -733,6 +704,12 @@ the ``-n`` option indicated that only 6 processes should be launched. Mapping Processes to Nodes: Using Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate?
Should it be updated and + retained, or removed? + The examples above illustrate the default mapping of process processes to nodes. This mapping can also be controlled with various ``mpirun`` options that describe mapping policies. @@ -845,6 +822,12 @@ and 2 each running uptime on nodes ``bb`` and ``cc``, respectively. Mapping, Ranking, and Binding: Oh My! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI employs a three-phase procedure for assigning process locations and ranks: @@ -934,7 +917,7 @@ Alternatively, processes can be mapped and bound to specified cores using the ``--map-by pe-list=`` option. For example, ``--map-by pe-list=0,2,5`` will map three processes all three of which will be bound to logical cores ``0,2,5``. If you intend to bind each of the three processes to different -cores then the ``:ordered`` qualifier can be used like +cores then the ``:ordered`` qualifier can be used like ``--map-by pe-list=0,2,5:ordered``. In this example, the first process on a node will be bound to CPU 0, the second process on the node will be bound to CPU 2, and the third process on the node will be bound to @@ -992,7 +975,7 @@ in ranking when the ``span`` qualifier is used instead of the default. In the above case, the output shows us that 2 cores have been bound per process. Specifically, the mapping by ``slot`` with the ``PE=2`` qualifier indicated that each slot (i.e., process) should consume two processor -elements. By default, Open MPI defines "processor element" as "core", +elements. By default, Open MPI defines "processor element" as "core", and therefore the ``--bind-to core`` caused each process to be bound to both of the cores to which it was mapped. 
@@ -1030,16 +1013,16 @@ MCA parameters can be set not only on the mpirun command line, but alternatively in a system or user ``mca-params.conf`` file or as environment variables, as described in the :ref:`Setting MCA Parameters `. These are MCA parameters for -the PRRTE runtime so the command line argument ``--PRRTEmca`` must be used to +the PRRTE runtime so the command line argument ``--PRRTEmca`` must be used to pass the MCA parameter key/value pair. Alternatively, the MCA parameter key/ -value pair may be specific on the command line by prefixing the key with +value pair may be specific on the command line by prefixing the key with ``PRRTE_MCA_``. Some examples include: .. list-table:: :header-rows: 1 * - Option - - PRRTE MCA parameter key + - PRRTE MCA parameter key - Value * - ``--map-by core`` @@ -1071,6 +1054,12 @@ value pair may be specific on the command line by prefixing the key with Defaults for Mapping, Ranking, and Binding ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + If the user does not specify each of ``--map-by``, ``--rank-by``, and ``--bind-to`` option then the default values are as follows: * If no options are specified then @@ -1167,6 +1156,12 @@ The mapping pattern might be better seen if we change the default ``--rank-by`` Rankfiles ^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Rankfiles are text files that specify detailed information about how individual processes should be mapped to nodes, and to which processor(s) they should be bound. Each line of a rankfile specifies @@ -1226,6 +1221,12 @@ indexes of package and cores. Application Context or Executable Program? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + To distinguish the two different forms, mpirun looks on the command line for ``--app`` option. If it is specified, then the file named on the command line is assumed to be an application context. If it is @@ -1234,6 +1235,12 @@ not specified, then the file is assumed to be an executable program. Locating Files ^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + If no relative or absolute path is specified for a file, Open MPI will first look for files by searching the directories specified by the ``--path`` option. If there is no ``--path`` option set or if the @@ -1252,6 +1259,12 @@ current working directory from the invocation of ``mpirun``. Current Working Directory ^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--wdir`` ``mpirun`` option (and its synonym, ``--wd``) allows the user to change to an arbitrary directory before the program is invoked. It can also be used in application context files to specify @@ -1279,6 +1292,12 @@ does not wait until :ref:`MPI_INIT(3) ` is called. Standard I/O ^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI directs UNIX standard input to ``/dev/null`` on all processes except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process inherits standard input from ``mpirun``. @@ -1309,6 +1328,12 @@ will be collected into the ``my_output`` file. Signal Propagation ^^^^^^^^^^^^^^^^^^ +.. 
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + When ``mpirun`` receives a SIGTERM and SIGINT, it will attempt to kill the entire job by sending all processes in the job a SIGTERM, waiting a small number of seconds, then sending all processes in the job a @@ -1326,6 +1351,12 @@ Other signals are not currently propagated by ``mpirun``. Process Termination / Signal Handling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + During the run of an MPI application, if any process dies abnormally (either exiting before invoking :ref:`MPI_FINALIZE(3) `, or dying as the result of a signal), ``mpirun`` will print out an @@ -1346,6 +1377,12 @@ safest) for the user to only clean up non-MPI state. Process Environment ^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Processes in the MPI application inherit their environment from the PRRTE daemon upon the node on which they are running. The environment is typically inherited from the user's shell. On remote @@ -1365,6 +1402,12 @@ for more details. Remote Execution ^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI requires that the ``PATH`` environment variable be set to find executables on remote nodes (this is typically only necessary in rsh- or ssh-based environments |mdash| batch/scheduled environments @@ -1431,6 +1474,12 @@ is equivalent to Exported Environment Variables ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + All environment variables that are named in the form ``OMPI_*`` will automatically be exported to new processes on the local and remote nodes. Environmental parameters can also be set/forwarded to the new @@ -1448,6 +1497,12 @@ them. Setting MCA Parameters ^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--mca`` switch allows the passing of parameters to various MCA (Modular Component Architecture) modules. MCA modules have direct impact on MPI programs because they allow tunable parameters to be set @@ -1508,6 +1563,12 @@ page for detailed information on this command. Setting MCA parameters and environment variables from file ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--tune`` command line option and its synonym ``--mca`` ``mca_base_envar_file_prefix`` allows a user to set MCA parameters and environment variables with the syntax described below. This option @@ -1532,6 +1593,12 @@ have higher precedence than variables specified in the file. Running as root ^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + .. warning:: The Open MPI team **strongly** advises against executing ``mpirun`` as the root user. MPI applications should be run as regular (non-root) users. @@ -1558,6 +1625,12 @@ against this behavior. Exit status ^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? 
Should it be updated and + retained, or removed? + There is no standard definition for what ``mpirun`` should return as an exit status. After considerable discussion, we settled on the following method for assigning the ``mpirun`` exit status (note: in @@ -1599,6 +1672,12 @@ bullet points above). EXAMPLES -------- +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Be sure also to see the examples throughout the sections above. .. code:: sh @@ -1613,6 +1692,12 @@ messages. RETURN VALUE ------------ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + ``mpirun`` returns 0 if all processes started by mpirun exit after calling :ref:`MPI_FINALIZE(3) `. A non-zero value is returned if an internal error occurred in mpirun, or one or more diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index bd1bc89ad57..54445d8b743 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -4,9 +4,9 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. -Open MPI version 5.0.0rc12 +Open MPI version 5.0.0rc13 -------------------------- -:Date: 19 May 2023 +:Date: 29 September 2023 .. admonition:: The MPIR API has been removed :class: warning @@ -66,30 +66,23 @@ Open MPI version 5.0.0rc12 Libevent symbols and then statically pulled the library into ``libmpi.so``. -- Changes since rc11: - - - ``accelerator/rocm``: add SYNC_MEMOPS support. - - Update PMIx, PRRTe, and OAC submodule pointers. - - Fix ``mca_btl_ofi_flush()`` in multithreaded environments.. - - ``smcuda``: fixed an edge case when building MCA components as - dynamic shared objects. - - Fix ``MPI_Session_init()`` bug if all previous sessions are - finalized. 
- - Fix `mpi4py `_ hang in - ``MPI_Intercomm_create_from_groups()``. - - Fix finalization segfault with OSHMEM 4.1.5. - - Improve AVX detection. Fixes ``op/avx`` link failure with the - ``nvhpc`` compiler. - - Fix incorrect results with ``pml/ucx`` using Intel compiler. - - Fix segfault when broadcasting large MPI structs. - - Add platform files for Google Cloud HPC. - - UCC/HCOLL: Fix ``MPI_Waitall()`` for non blokcing collectives. - - Fix pre-built docs check. +- Changes since rc12: + + - Update PMIx to the ``v4.2.6`` release tag. Hash: ``f20e0d5``. + - Update PRRTE to the ``v3.0.1`` release tag. Hash: ``63370ca``. + - Lots of documentation updates. + - Fixed parameter name in ``MPI_Intercomm_merge``. Thanks to Yan Wu for the report. + - ``OFI``: Update NIC selection to determine optimal interfaces from the current process. + - Fix reordering of received data in ``MPI_Gather``. + - Disable builds with ``HWLOC`` versions >= 3.0.0. This is currently not supported. + - Fix re-ordering of ranks in ``MPI_Dist_graph_create``. + - ``coll/HAN``: Fix bug when using ``MPI_IN_PLACE`` with ``MPI_Reduce``. + - Fix ``MPI_Type_dup`` to propagate errors from inner calls. + - Fix the compilation of the monitoring infrastructure. + - Various other bug fixes. - All other notable updates for v5.0.0: - - Update PMIx to the ``v4.2`` branch - current hash: ``f34a7ce2``. - - Update PRRTE to the ``v3.0`` branch - current hash: ``c4925aa5cc``. - New Features: - ULFM Fault Tolerance support has been added. See :ref:`the ULFM @@ -154,9 +147,10 @@ Open MPI version 5.0.0rc12 - Many MPI one-sided and RDMA emulation fixes for the ``tcp`` BTL. - - This patch series fixs many issues when running with ``--mca - osc rdma --mca btl tcp``, i.e., TCP support for one sided - MPI calls. + This patch series fixes many issues when running with ``--mca + osc rdma --mca btl tcp``, i.e., TCP support for one sided + MPI calls. + - Many MPI one-sided fixes for the ``uct`` BTL.
- Added support for ``acc_single_intrinsic`` to the one-sided ``ucx`` component. diff --git a/docs/no-prrte-content.rst.txt b/docs/no-prrte-content.rst.txt new file mode 100644 index 00000000000..ea034952d31 --- /dev/null +++ b/docs/no-prrte-content.rst.txt @@ -0,0 +1,24 @@ +.. This file is only used in certain cases. Hence, the original file + in the Open MPI "docs" source tree ends in ".txt", so that Sphinx + will not complain if it is not used. If it *is* used, it is copied + to another file (that ends in ".rst") so that it can be properly + found / used by Sphinx. + +No content +^^^^^^^^^^ + +There is no meaningful content in this file because Open MPI was either: + +* Built without PRRTE support. + +* Built with a PRRTE that was too old to include machine-readable + documentation that could be incorporated into Open MPI's + documentation. + +If you build Open MPI with a newer version of PRRTE (and have the +Sphinx tool available when you run Open MPI's ``configure`` command), +you should get more meaningful documentation here. + +Hence, there is no documentation for this section. + +Sorry! diff --git a/ompi/group/group.h b/ompi/group/group.h index 58251892015..c188e98f02f 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2020 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -459,19 +459,17 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t */ static inline int ompi_group_proc_lookup_rank (ompi_group_t* group, ompi_proc_t* proc) { - int i, np, v; + int i, np, rank; + opal_vpid_t v; assert( NULL != proc ); assert( !ompi_proc_is_sentinel(proc) ); np = ompi_group_size(group); if( 0 == np ) return MPI_PROC_NULL; /* heuristic: On comm_world, start the lookup from v=vpid, so that - * when working on comm_world, the search is O(1); - * Otherwise, wild guess: start from a proportional position - * compared to comm_world position. */ + * when working on comm_world, on average, the search remains O(1). */ v = proc->super.proc_name.vpid; - v = (v 0 ) { @@ -455,7 +443,9 @@ mca_coll_han_init_dynamic_rules(void) fclose(fptr); check_dynamic_rules(); + free(coll_name); free(algorithm_name); + free(target_comp_name); return OMPI_SUCCESS; cannot_allocate: @@ -465,10 +455,9 @@ mca_coll_han_init_dynamic_rules(void) opal_output_verbose(0, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules " "cannot allocate dynamic rules\n"); - if( NULL != coll_name ) { - free(coll_name); - } + free(coll_name); free(algorithm_name); + free(target_comp_name); fclose (fptr); /* We disable the module, we don't need to keep the rules */ mca_coll_han_free_dynamic_rules(); @@ -481,10 +470,9 @@ mca_coll_han_init_dynamic_rules(void) "Will use mca parameters defined rules. 
" "To see error detail, please set " "collective verbosity level over 5\n"); - if( NULL != coll_name ) { - free(coll_name); - } + free(coll_name); free(algorithm_name); + free(target_comp_name); fclose (fptr); /* We disable the module, we don't need to keep the rules */ mca_coll_han_free_dynamic_rules(); diff --git a/ompi/mpi/fortran/base/fint_2_int.h b/ompi/mpi/fortran/base/fint_2_int.h index d3c71454386..ec2ba43fa1b 100644 --- a/ompi/mpi/fortran/base/fint_2_int.h +++ b/ompi/mpi/fortran/base/fint_2_int.h @@ -60,7 +60,7 @@ /* This is for OUT parameters. Does only alloc */ #define OMPI_ARRAY_FINT_2_INT_ALLOC(in, n) \ - OMPI_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) /* This is for IN/IN-OUT parameters. Does alloc and assignment */ #define OMPI_ARRAY_FINT_2_INT(in, n) \ @@ -117,7 +117,7 @@ /* This is for OUT parameters. Does only alloc */ #define OMPI_ARRAY_FINT_2_INT_ALLOC(in, n) \ - OMPI_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) #define OMPI_ARRAY_FINT_2_INT(in, n) \ do { \ @@ -204,7 +204,7 @@ # define OMPI_LOGICAL_ARRAY_NAME_DECL(in) int * c_##in # define OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) c_##in # define OMPI_ARRAY_LOGICAL_2_INT_ALLOC(in,n) \ - OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) # define OMPI_ARRAY_LOGICAL_2_INT_CLEANUP(in) \ free(OMPI_LOGICAL_ARRAY_NAME_CONVERT(in)) diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index e401a48b81e..36c01537895 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
- * Copyright (c) 2004-2020 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -105,10 +105,17 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) { ssize_t cnt; size_t i, num_vecs; + struct msghdr msg = { + .msg_iov = frag->iov_ptr, + .msg_iovlen = frag->iov_cnt }; + int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - /* non-blocking write, but continue if interrupted */ + /* non-blocking write, continue if interrupted */ do { - cnt = writev(sd, frag->iov_ptr, frag->iov_cnt); + /* Use sendmsg to avoid issues with SIGPIPE as described in + * https://blog.erratasec.com/2018/10/tcpip-sockets-and-sigpipe.html# + */ + cnt = sendmsg(sd, &msg, msg_flags); if (cnt < 0) { switch (opal_socket_errno) { case EINTR: @@ -116,7 +123,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) case EWOULDBLOCK: return false; case EFAULT: - BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n", + BTL_ERROR(("mca_btl_tcp_frag_send: sendmsg error (%p, %lu)\n\t%s(%lu)\n", frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len, strerror(opal_socket_errno), (unsigned long) frag->iov_cnt)); /* send_lock held by caller */ @@ -125,7 +132,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) return false; default: BTL_PEER_ERROR(frag->endpoint->endpoint_proc->proc_opal, - ("mca_btl_tcp_frag_send: writev failed: %s (%d)", + ("mca_btl_tcp_frag_send: sendmsg failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); /* send_lock held by caller */ frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; diff --git a/opal/util/Makefile.am b/opal/util/Makefile.am index 646f44412b2..23f6b0ccd67 100644 --- a/opal/util/Makefile.am +++ b/opal/util/Makefile.am @@ -63,7 +63,6 @@ headers = \ numtostr.h \ opal_environ.h \ opal_getcwd.h \ - 
opal_pty.h \ os_dirpath.h \ os_path.h \ output.h \ @@ -108,7 +107,6 @@ libopalutil_core_la_SOURCES = \ numtostr.c \ opal_environ.c \ opal_getcwd.c \ - opal_pty.c \ os_dirpath.c \ os_path.c \ output.c \ diff --git a/opal/util/opal_pty.c b/opal/util/opal_pty.c deleted file mode 100644 index adbbc8570bb..00000000000 --- a/opal/util/opal_pty.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2018 Cisco Systems, Inc. All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/*- - * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "opal_config.h" - -#ifdef HAVE_SYS_CDEFS_H -# include -#endif -#ifdef HAVE_SYS_TYPES_H -# include -#endif -#include -#ifdef HAVE_SYS_IOCTL_H -# include -#endif -#ifdef HAVE_FCNTL_H -# include -#endif -#ifdef HAVE_TERMIOS_H -# include -#else -# ifdef HAVE_TERMIO_H -# include -# endif -#endif -#include -#ifdef HAVE_UNISTD_H -# include -#endif -#include -#include -#ifdef HAVE_GRP_H -# include -#endif -#ifdef HAVE_PTY_H -# include -#endif -#ifdef HAVE_UTMP_H -# include -#endif - -#ifdef HAVE_PTSNAME -# include -# ifdef HAVE_STROPTS_H -# include -# endif -#endif - -#ifdef HAVE_UTIL_H -# include -#endif - -#include "opal/util/opal_pty.h" - -/* The only public interface is openpty - all others are to support - openpty() */ - -#if OPAL_ENABLE_PTY_SUPPORT == 0 - -int opal_openpty(int *amaster, int *aslave, char *name, void *termp, void *winpp) -{ - return -1; -} - -#elif defined(HAVE_OPENPTY) - -int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, struct winsize *winp) -{ - return openpty(amaster, aslave, name, termp, winp); -} - -#else - -/* implement openpty in terms of ptym_open and ptys_open */ - -static int ptym_open(char *pts_name); -static int 
ptys_open(int fdm, char *pts_name); - -int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, struct winsize *winp) -{ - char line[20]; - *amaster = ptym_open(line); - if (*amaster < 0) { - return -1; - } - *aslave = ptys_open(*amaster, line); - if (*aslave < 0) { - close(*amaster); - return -1; - } - if (name) { - // We don't know the max length of name, but we do know the - // max length of the source, so at least use that. - opal_string_copy(name, line, sizeof(line)); - } -# ifndef TCSAFLUSH -# define TCSAFLUSH TCSETAF -# endif - if (termp) { - (void) tcsetattr(*aslave, TCSAFLUSH, termp); - } -# ifdef TIOCSWINSZ - if (winp) { - (void) ioctl(*aslave, TIOCSWINSZ, (char *) winp); - } -# endif - return 0; -} - -static int ptym_open(char *pts_name) -{ - int fdm; -# ifdef HAVE_PTSNAME - char *ptr; - -# ifdef _AIX - strcpy(pts_name, "/dev/ptc"); -# else - strcpy(pts_name, "/dev/ptmx"); -# endif - fdm = open(pts_name, O_RDWR); - if (fdm < 0) { - return -1; - } - if (grantpt(fdm) < 0) { /* grant access to slave */ - close(fdm); - return -2; - } - if (unlockpt(fdm) < 0) { /* clear slave's lock flag */ - close(fdm); - return -3; - } - ptr = ptsname(fdm); - if (ptr == NULL) { /* get slave's name */ - close(fdm); - return -4; - } - strcpy(pts_name, ptr); /* return name of slave */ - return fdm; /* return fd of master */ -# else - char *ptr1, *ptr2; - - strcpy(pts_name, "/dev/ptyXY"); - /* array index: 012345689 (for references in following code) */ - for (ptr1 = "pqrstuvwxyzPQRST"; *ptr1 != 0; ptr1++) { - pts_name[8] = *ptr1; - for (ptr2 = "0123456789abcdef"; *ptr2 != 0; ptr2++) { - pts_name[9] = *ptr2; - /* try to open master */ - fdm = open(pts_name, O_RDWR); - if (fdm < 0) { - if (errno == ENOENT) { /* different from EIO */ - return -1; /* out of pty devices */ - } else { - continue; /* try next pty device */ - } - } - pts_name[5] = 't'; /* change "pty" to "tty" */ - return fdm; /* got it, return fd of master */ - } - } - return -1; /* out of pty 
devices */ -# endif -} - -static int ptys_open(int fdm, char *pts_name) -{ - int fds; -# ifdef HAVE_PTSNAME - /* following should allocate controlling terminal */ - fds = open(pts_name, O_RDWR); - if (fds < 0) { - close(fdm); - return -5; - } -# if defined(__SVR4) && defined(__sun) - if (ioctl(fds, I_PUSH, "ptem") < 0) { - close(fdm); - close(fds); - return -6; - } - if (ioctl(fds, I_PUSH, "ldterm") < 0) { - close(fdm); - close(fds); - return -7; - } -# endif - - return fds; -# else - int gid; - struct group *grptr; - - grptr = getgrnam("tty"); - if (grptr != NULL) { - gid = grptr->gr_gid; - } else { - gid = -1; /* group tty is not in the group file */ - } - /* following two functions don't work unless we're root */ - chown(pts_name, getuid(), gid); - chmod(pts_name, S_IRUSR | S_IWUSR | S_IWGRP); - fds = open(pts_name, O_RDWR); - if (fds < 0) { - close(fdm); - return -1; - } - return fds; -# endif -} - -#endif /* #ifdef HAVE_OPENPTY */ diff --git a/opal/util/opal_pty.h b/opal/util/opal_pty.h deleted file mode 100644 index f30cd97d5ec..00000000000 --- a/opal/util/opal_pty.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OPAL_UTIL_PTY_H -#define OPAL_UTIL_PTY_H - -#include "opal_config.h" - -#ifdef HAVE_UTIL_H -# include -#endif -#ifdef HAVE_LIBUTIL_H -# include -#endif -#ifdef HAVE_TERMIOS_H -# include -#else -# ifdef HAVE_TERMIO_H -# include -# endif -#endif - -BEGIN_C_DECLS - -#if OPAL_ENABLE_PTY_SUPPORT - -OPAL_DECLSPEC int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, - struct winsize *winp); - -#else - -OPAL_DECLSPEC int opal_openpty(int *amaster, int *aslave, char *name, void *termp, void *winpp); - -#endif - -END_C_DECLS - -#endif /* OPAL_UTIL_PTY_H */ diff --git a/opal/win32/opal_uio.c b/opal/win32/opal_uio.c index 0270e0f4f7b..3c4bfe7550b 100644 --- a/opal/win32/opal_uio.c +++ b/opal/win32/opal_uio.c @@ -2,7 +2,7 @@ Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana University Research and Technology Corporation. All rights reserved. - Copyright (c) 2004-2005 The University of Tennessee and The University + Copyright (c) 2004-2023 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -26,12 +26,12 @@ of code to handle the windows error flags */ -int writev(int fd, struct iovec *iov, int cnt) +ssize_t sendmsg(int fd, const struct msghdr *message, int flags) { int err; DWORD sendlen; - err = WSASend((SOCKET) fd, &(iov->data), cnt, &sendlen, 0, NULL, NULL); + err = WSASendMsg((SOCKET) fd, message, flags, &sendlen, NULL, NULL); if (err < 0) { return err; diff --git a/opal/win32/opal_uio.h b/opal/win32/opal_uio.h index 2691b0bd3d4..642beda1128 100644 --- a/opal/win32/opal_uio.h +++ b/opal/win32/opal_uio.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
- * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -33,14 +33,14 @@ struct iovec { #define iov_len data.len BEGIN_C_DECLS + /* - * writev: - writev writes data to file descriptor fd, and from the buffers - described by iov. The number of buffers is specified by cnt. The - buffers are used in the order specified. Operates just like write - except that data is taken from iov instead of a contiguous buffer. + * sendmsg: + * writes data to a file descriptor. This is a convenience function to allow + * the TCP BTL to support Windows. Overall it should behave similarly to the + * POSIX sendmsg function. */ -OPAL_DECLSPEC int writev(int fd, struct iovec *iov, int cnt); +OPAL_DECLSPEC ssize_t sendmsg(int socket, const struct msghdr *message, int flags); /* readv reads data from file descriptor fd, and puts the result in the diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 570b4d25a7a..5493d78e661 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -22,6 +22,7 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/common/ucx/common_ucx.h" #include "opal/util/opal_environ.h" +#include "opal/util/minmax.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/mca/pml/pml.h" @@ -126,6 +127,171 @@ static ucp_request_param_t mca_spml_ucx_request_param_b = { }; #endif +unsigned +mca_spml_ucx_mem_map_flags_symmetric_rkey(struct mca_spml_ucx *spml_ucx) +{ +#if HAVE_DECL_UCP_MEM_MAP_SYMMETRIC_RKEY + if (spml_ucx->symmetric_rkey_max_count > 0) { + return UCP_MEM_MAP_SYMMETRIC_RKEY; + } +#endif + + return 0; +} + +void mca_spml_ucx_rkey_store_init(mca_spml_ucx_rkey_store_t *store) +{ + store->array = NULL; + store->count = 0; + store->size = 0; +} + +void 
mca_spml_ucx_rkey_store_cleanup(mca_spml_ucx_rkey_store_t *store) +{ + int i; + + for (i = 0; i < store->count; i++) { + if (store->array[i].refcnt != 0) { + SPML_UCX_ERROR("rkey store destroy: %d/%d has refcnt %d > 0", + i, store->count, store->array[i].refcnt); + } + + ucp_rkey_destroy(store->array[i].rkey); + } + + free(store->array); +} + +/** + * Find position in sorted array for existing or future entry + * + * @param[in] store Store of the entries + * @param[in] worker Common worker for rkeys used + * @param[in] rkey Remote key to search for + * @param[out] index Index of entry + * + * @return + * OSHMEM_ERR_NOT_FOUND: index contains the position where future element + * should be inserted to keep array sorted + * OSHMEM_SUCCESS : index contains the position of the element + * Other error : index is not valid + */ +static int mca_spml_ucx_rkey_store_find(const mca_spml_ucx_rkey_store_t *store, + const ucp_worker_h worker, + const ucp_rkey_h rkey, + int *index) +{ +#if HAVE_DECL_UCP_RKEY_COMPARE + ucp_rkey_compare_params_t params; + int i, result, m, end; + ucs_status_t status; + + for (i = 0, end = store->count; i < end;) { + m = (i + end) / 2; + + params.field_mask = 0; + status = ucp_rkey_compare(worker, store->array[m].rkey, + rkey, &params, &result); + if (status != UCS_OK) { + return OSHMEM_ERROR; + } else if (result == 0) { + *index = m; + return OSHMEM_SUCCESS; + } else if (result > 0) { + end = m; + } else { + i = m + 1; + } + } + + *index = i; + return OSHMEM_ERR_NOT_FOUND; +#else + return OSHMEM_ERROR; +#endif +} + +static void mca_spml_ucx_rkey_store_insert(mca_spml_ucx_rkey_store_t *store, + int i, ucp_rkey_h rkey) +{ + int size; + mca_spml_ucx_rkey_t *tmp; + + if (store->count >= mca_spml_ucx.symmetric_rkey_max_count) { + return; + } + + if (store->count >= store->size) { + size = opal_min(opal_max(store->size, 8) * 2, + mca_spml_ucx.symmetric_rkey_max_count); + tmp = realloc(store->array, size * sizeof(*store->array)); + if (tmp == NULL) { + 
return; + } + + store->array = tmp; + store->size = size; + } + + memmove(&store->array[i + 1], &store->array[i], + (store->count - i) * sizeof(*store->array)); + store->array[i].rkey = rkey; + store->array[i].refcnt = 1; + store->count++; + return; +} + +/* Takes ownership of input ucp remote key */ +static ucp_rkey_h mca_spml_ucx_rkey_store_get(mca_spml_ucx_rkey_store_t *store, + ucp_worker_h worker, + ucp_rkey_h rkey) +{ + int ret, i; + + if (mca_spml_ucx.symmetric_rkey_max_count == 0) { + return rkey; + } + + ret = mca_spml_ucx_rkey_store_find(store, worker, rkey, &i); + if (ret == OSHMEM_SUCCESS) { + ucp_rkey_destroy(rkey); + store->array[i].refcnt++; + return store->array[i].rkey; + } + + if (ret == OSHMEM_ERR_NOT_FOUND) { + mca_spml_ucx_rkey_store_insert(store, i, rkey); + } + + return rkey; +} + +static void mca_spml_ucx_rkey_store_put(mca_spml_ucx_rkey_store_t *store, + ucp_worker_h worker, + ucp_rkey_h rkey) +{ + mca_spml_ucx_rkey_t *entry; + int ret, i; + + ret = mca_spml_ucx_rkey_store_find(store, worker, rkey, &i); + if (ret != OSHMEM_SUCCESS) { + goto out; + } + + entry = &store->array[i]; + assert(entry->rkey == rkey); + if (--entry->refcnt > 0) { + return; + } + + memmove(&store->array[i], &store->array[i + 1], + (store->count - (i + 1)) * sizeof(*store->array)); + store->count--; + +out: + ucp_rkey_destroy(rkey); +} + int mca_spml_ucx_enable(bool enable) { SPML_UCX_VERBOSE(50, "*** ucx ENABLED ****"); @@ -240,6 +406,7 @@ int mca_spml_ucx_ctx_mkey_add(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn { int rc; ucs_status_t err; + ucp_rkey_h rkey; rc = mca_spml_ucx_ctx_mkey_new(ucx_ctx, pe, segno, ucx_mkey); if (OSHMEM_SUCCESS != rc) { @@ -248,11 +415,18 @@ int mca_spml_ucx_ctx_mkey_add(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn } if (mkey->u.data) { - err = ucp_ep_rkey_unpack(ucx_ctx->ucp_peers[pe].ucp_conn, mkey->u.data, &((*ucx_mkey)->rkey)); + err = ucp_ep_rkey_unpack(ucx_ctx->ucp_peers[pe].ucp_conn, mkey->u.data, &rkey); if (UCS_OK != 
err) { SPML_UCX_ERROR("failed to unpack rkey: %s", ucs_status_string(err)); return OSHMEM_ERROR; } + + if (!oshmem_proc_on_local_node(pe)) { + rkey = mca_spml_ucx_rkey_store_get(&ucx_ctx->rkey_store, ucx_ctx->ucp_worker[0], rkey); + } + + (*ucx_mkey)->rkey = rkey; + rc = mca_spml_ucx_ctx_mkey_cache(ucx_ctx, mkey, segno, pe); if (OSHMEM_SUCCESS != rc) { SPML_UCX_ERROR("mca_spml_ucx_ctx_mkey_cache failed"); @@ -267,7 +441,7 @@ int mca_spml_ucx_ctx_mkey_del(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn ucp_peer_t *ucp_peer; int rc; ucp_peer = &(ucx_ctx->ucp_peers[pe]); - ucp_rkey_destroy(ucx_mkey->rkey); + mca_spml_ucx_rkey_store_put(&ucx_ctx->rkey_store, ucx_ctx->ucp_worker[0], ucx_mkey->rkey); ucx_mkey->rkey = NULL; rc = mca_spml_ucx_peer_mkey_cache_del(ucp_peer, segno); if(OSHMEM_SUCCESS != rc){ @@ -725,7 +899,8 @@ sshmem_mkey_t *mca_spml_ucx_register(void* addr, UCP_MEM_MAP_PARAM_FIELD_FLAGS; mem_map_params.address = addr; mem_map_params.length = size; - mem_map_params.flags = flags; + mem_map_params.flags = flags | + mca_spml_ucx_mem_map_flags_symmetric_rkey(&mca_spml_ucx); status = ucp_mem_map(mca_spml_ucx.ucp_context, &mem_map_params, &mem_h); if (UCS_OK != status) { @@ -917,6 +1092,8 @@ static int mca_spml_ucx_ctx_create_common(long options, mca_spml_ucx_ctx_t **ucx } } + mca_spml_ucx_rkey_store_init(&ucx_ctx->rkey_store); + *ucx_ctx_p = ucx_ctx; return OSHMEM_SUCCESS; diff --git a/oshmem/mca/spml/ucx/spml_ucx.h b/oshmem/mca/spml/ucx/spml_ucx.h index a93ff3756a3..2fec131ad2d 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.h +++ b/oshmem/mca/spml/ucx/spml_ucx.h @@ -76,18 +76,31 @@ struct ucp_peer { size_t mkeys_cnt; }; typedef struct ucp_peer ucp_peer_t; - + +/* An rkey_store entry */ +typedef struct mca_spml_ucx_rkey { + ucp_rkey_h rkey; + int refcnt; +} mca_spml_ucx_rkey_t; + +typedef struct mca_spml_ucx_rkey_store { + mca_spml_ucx_rkey_t *array; + int size; + int count; +} mca_spml_ucx_rkey_store_t; + struct mca_spml_ucx_ctx { - ucp_worker_h *ucp_worker; - 
ucp_peer_t *ucp_peers; - long options; - opal_bitmap_t put_op_bitmap; - unsigned long nb_progress_cnt; - unsigned int ucp_workers; - int *put_proc_indexes; - unsigned put_proc_count; - bool synchronized_quiet; - int strong_sync; + ucp_worker_h *ucp_worker; + ucp_peer_t *ucp_peers; + long options; + opal_bitmap_t put_op_bitmap; + unsigned long nb_progress_cnt; + unsigned int ucp_workers; + int *put_proc_indexes; + unsigned put_proc_count; + bool synchronized_quiet; + int strong_sync; + mca_spml_ucx_rkey_store_t rkey_store; }; typedef struct mca_spml_ucx_ctx mca_spml_ucx_ctx_t; @@ -128,6 +141,7 @@ struct mca_spml_ucx { unsigned long nb_ucp_worker_progress; unsigned int ucp_workers; unsigned int ucp_worker_cnt; + int symmetric_rkey_max_count; }; typedef struct mca_spml_ucx mca_spml_ucx_t; @@ -280,6 +294,11 @@ extern int mca_spml_ucx_team_fcollect(shmem_team_t team, void extern int mca_spml_ucx_team_reduce(shmem_team_t team, void *dest, const void *source, size_t nreduce, int operation, int datatype); +extern unsigned +mca_spml_ucx_mem_map_flags_symmetric_rkey(struct mca_spml_ucx *spml_ucx); + +extern void mca_spml_ucx_rkey_store_init(mca_spml_ucx_rkey_store_t *store); +extern void mca_spml_ucx_rkey_store_cleanup(mca_spml_ucx_rkey_store_t *store); static inline int mca_spml_ucx_peer_mkey_get(ucp_peer_t *ucp_peer, int index, spml_ucx_cached_mkey_t **out_rmkey) diff --git a/oshmem/mca/spml/ucx/spml_ucx_component.c b/oshmem/mca/spml/ucx/spml_ucx_component.c index 1ab00ac1786..e44a800a8be 100644 --- a/oshmem/mca/spml/ucx/spml_ucx_component.c +++ b/oshmem/mca/spml/ucx/spml_ucx_component.c @@ -153,6 +153,10 @@ static int mca_spml_ucx_component_register(void) "Enable asynchronous progress thread", &mca_spml_ucx.async_progress); + mca_spml_ucx_param_register_int("symmetric_rkey_max_count", 0, + "Size of the symmetric key store. 
Non-zero to enable, typical use 5000", + &mca_spml_ucx.symmetric_rkey_max_count); + mca_spml_ucx_param_register_int("async_tick_usec", 3000, "Asynchronous progress tick granularity (in usec)", &mca_spml_ucx.async_tick); @@ -332,6 +336,8 @@ static int spml_ucx_init(void) mca_spml_ucx_ctx_default.ucp_workers++; } + mca_spml_ucx_rkey_store_init(&mca_spml_ucx_ctx_default.rkey_store); + wrk_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE; err = ucp_worker_query(mca_spml_ucx_ctx_default.ucp_worker[0], &wrk_attr); @@ -436,10 +442,25 @@ static void _ctx_cleanup(mca_spml_ucx_ctx_t *ctx) free(ctx->ucp_peers); } +static void mca_spml_ucx_ctx_fini(mca_spml_ucx_ctx_t *ctx) +{ + unsigned int i; + + mca_spml_ucx_rkey_store_cleanup(&ctx->rkey_store); + for (i = 0; i < ctx->ucp_workers; i++) { + ucp_worker_destroy(ctx->ucp_worker[i]); + } + free(ctx->ucp_worker); + if (ctx != &mca_spml_ucx_ctx_default) { + free(ctx); + } +} + static int mca_spml_ucx_component_fini(void) { int fenced = 0, i; int ret = OSHMEM_SUCCESS; + mca_spml_ucx_ctx_t *ctx; opal_progress_unregister(spml_ucx_default_progress); if (mca_spml_ucx.active_array.ctxs_count) { @@ -492,36 +513,26 @@ static int mca_spml_ucx_component_fini(void) } } - /* delete all workers */ for (i = 0; i < mca_spml_ucx.active_array.ctxs_count; i++) { - ucp_worker_destroy(mca_spml_ucx.active_array.ctxs[i]->ucp_worker[0]); - free(mca_spml_ucx.active_array.ctxs[i]->ucp_worker); - free(mca_spml_ucx.active_array.ctxs[i]); + mca_spml_ucx_ctx_fini(mca_spml_ucx.active_array.ctxs[i]); } for (i = 0; i < mca_spml_ucx.idle_array.ctxs_count; i++) { - ucp_worker_destroy(mca_spml_ucx.idle_array.ctxs[i]->ucp_worker[0]); - free(mca_spml_ucx.idle_array.ctxs[i]->ucp_worker); - free(mca_spml_ucx.idle_array.ctxs[i]); + mca_spml_ucx_ctx_fini(mca_spml_ucx.idle_array.ctxs[i]); } if (mca_spml_ucx_ctx_default.ucp_worker) { - for (i = 0; i < (signed int)mca_spml_ucx.ucp_workers; i++) { - ucp_worker_destroy(mca_spml_ucx_ctx_default.ucp_worker[i]); - } - 
free(mca_spml_ucx_ctx_default.ucp_worker); + mca_spml_ucx_ctx_fini(&mca_spml_ucx_ctx_default); } if (mca_spml_ucx.aux_ctx != NULL) { - ucp_worker_destroy(mca_spml_ucx.aux_ctx->ucp_worker[0]); - free(mca_spml_ucx.aux_ctx->ucp_worker); + mca_spml_ucx_ctx_fini(mca_spml_ucx.aux_ctx); } mca_spml_ucx.enabled = false; /* not anymore */ free(mca_spml_ucx.active_array.ctxs); free(mca_spml_ucx.idle_array.ctxs); - free(mca_spml_ucx.aux_ctx); SHMEM_MUTEX_DESTROY(mca_spml_ucx.internal_mutex); pthread_mutex_destroy(&mca_spml_ucx.ctx_create_mutex); diff --git a/oshmem/mca/sshmem/ucx/configure.m4 b/oshmem/mca/sshmem/ucx/configure.m4 index 4991c7557c0..7bb9038c5d0 100644 --- a/oshmem/mca/sshmem/ucx/configure.m4 +++ b/oshmem/mca/sshmem/ucx/configure.m4 @@ -28,34 +28,9 @@ AC_DEFUN([MCA_oshmem_sshmem_ucx_CONFIG],[ save_LIBS="$LIBS" save_CPPFLAGS="$CPPFLAGS" - alloc_dm_LDFLAGS=" -L$ompi_check_ucx_libdir/ucx" - alloc_dm_LIBS=" -luct_ib" CPPFLAGS+=" $sshmem_ucx_CPPFLAGS" - LDFLAGS+=" $sshmem_ucx_LDFLAGS $alloc_dm_LDFLAGS" - LIBS+=" $sshmem_ucx_LIBS $alloc_dm_LIBS" - - AC_LANG_PUSH([C]) - AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[ - #include - #include - ]], - [[ - uct_md_h md = ucp_context_find_tl_md((ucp_context_h)NULL, ""); - (void)uct_ib_md_alloc_device_mem(md, NULL, NULL, 0, "", NULL); - uct_ib_md_release_device_mem(NULL); - ]])], - [ - AC_MSG_NOTICE([UCX device memory allocation is supported]) - AC_DEFINE([HAVE_UCX_DEVICE_MEM], [1], [Support for device memory allocation]) - sshmem_ucx_LIBS+=" $alloc_dm_LIBS" - sshmem_ucx_LDFLAGS+=" $alloc_dm_LDFLAGS" - ], - [ - AC_MSG_NOTICE([UCX device memory allocation is not supported]) - AC_DEFINE([HAVE_UCX_DEVICE_MEM], [0], [Support for device memory allocation]) - ]) - AC_LANG_POP([C]) + LDFLAGS+=" $sshmem_ucx_LDFLAGS" + LIBS+=" $sshmem_ucx_LIBS" CPPFLAGS="$save_CPPFLAGS" LDFLAGS="$save_LDFLAGS" @@ -66,4 +41,3 @@ AC_DEFUN([MCA_oshmem_sshmem_ucx_CONFIG],[ AC_SUBST([sshmem_ucx_LDFLAGS]) AC_SUBST([sshmem_ucx_LIBS]) ])dnl - diff --git 
a/oshmem/mca/sshmem/ucx/sshmem_ucx.h b/oshmem/mca/sshmem/ucx/sshmem_ucx.h index b6085374caa..90d41ac002c 100644 --- a/oshmem/mca/sshmem/ucx/sshmem_ucx.h +++ b/oshmem/mca/sshmem/ucx/sshmem_ucx.h @@ -35,7 +35,6 @@ OSHMEM_DECLSPEC extern mca_sshmem_ucx_component_t mca_sshmem_ucx_component; typedef struct mca_sshmem_ucx_segment_context { - void *dev_mem; sshmem_ucx_shadow_allocator_t *shadow_allocator; ucp_mem_h ucp_memh; } mca_sshmem_ucx_segment_context_t; diff --git a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c index fa38d0693a0..688bfce6f19 100644 --- a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c +++ b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c @@ -26,13 +26,6 @@ #include "sshmem_ucx.h" -//#include - -#if HAVE_UCX_DEVICE_MEM -#include -#include -#endif - #define ALLOC_ELEM_SIZE sizeof(uint64_t) #define min(a,b) ((a) < (b) ? (a) : (b)) #define max(a,b) ((a) > (b) ? (a) : (b)) @@ -104,7 +97,7 @@ static segment_allocator_t sshmem_ucx_allocator = { static int segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, - unsigned flags, long hint, void *dev_mem) + unsigned flags, ucs_memory_type_t mem_type, int err_level) { mca_sshmem_ucx_segment_context_t *ctx; int rc = OSHMEM_SUCCESS; @@ -120,15 +113,20 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | UCP_MEM_MAP_PARAM_FIELD_LENGTH | - UCP_MEM_MAP_PARAM_FIELD_FLAGS; + UCP_MEM_MAP_PARAM_FIELD_FLAGS | + UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE; - mem_map_params.address = address; - mem_map_params.length = size; - mem_map_params.flags = flags; + mem_map_params.address = address; + mem_map_params.length = size; + mem_map_params.flags = flags | + mca_spml_ucx_mem_map_flags_symmetric_rkey(spml); + mem_map_params.memory_type = mem_type; status = ucp_mem_map(spml->ucp_context, &mem_map_params, &mem_h); if (UCS_OK != status) { - SSHMEM_ERROR("ucp_mem_map() failed: %s\n", 
ucs_status_string(status)); + SSHMEM_VERBOSE(err_level, "ucp_mem_map(memory_type=%s) failed: %s\n", + ucs_memory_type_names[mem_type], + ucs_status_string(status)); rc = OSHMEM_ERROR; goto out; } @@ -161,12 +159,7 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); ds_buf->context = ctx; ds_buf->type = MAP_SEGMENT_ALLOC_UCX; - ds_buf->alloc_hints = hint; ctx->ucp_memh = mem_h; - ctx->dev_mem = dev_mem; - if (hint) { - ds_buf->allocator = &sshmem_ucx_allocator; - } out: OPAL_OUTPUT_VERBOSE( @@ -181,82 +174,37 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, return rc; } -#if HAVE_UCX_DEVICE_MEM -static uct_ib_device_mem_h alloc_device_mem(mca_spml_ucx_t *spml, size_t size, - void **address_p) -{ - uct_ib_device_mem_h dev_mem = NULL; - ucs_status_t status; - uct_md_h uct_md; - void *address; - size_t length; - - uct_md = ucp_context_find_tl_md(spml->ucp_context, "mlx5"); - if (uct_md == NULL) { - SSHMEM_VERBOSE(1, "ucp_context_find_tl_md() returned NULL\n"); - return NULL; - } - - /* If found a matching memory domain, allocate device memory on it */ - length = size; - address = NULL; - status = uct_ib_md_alloc_device_mem(uct_md, &length, &address, - UCT_MD_MEM_ACCESS_ALL, "sshmem_seg", - &dev_mem); - if (status != UCS_OK) { - /* If could not allocate device memory - fallback to mmap (since some - * PEs in the job may succeed and while others failed */ - SSHMEM_VERBOSE(1, "uct_ib_md_alloc_dm() failed: %s\n", - ucs_status_string(status)); - return NULL; - } - - SSHMEM_VERBOSE(3, "uct_ib_md_alloc_dm() returned address %p\n", address); - *address_p = address; - return dev_mem; -} -#endif - static int segment_create(map_segment_t *ds_buf, const char *file_name, size_t size, long hint) { mca_spml_ucx_t *spml = (mca_spml_ucx_t*)mca_spml.self; - unsigned flags; + unsigned flags = UCP_MEM_MAP_ALLOCATE; + int status; -#if 
HAVE_UCX_DEVICE_MEM - int ret = OSHMEM_ERROR; if (hint & SHMEM_HINT_DEVICE_NIC_MEM) { - if (size > UINT_MAX) { - return OSHMEM_ERR_BAD_PARAM; +#if HAVE_DECL_UCS_MEMORY_TYPE_RDMA + status = segment_create_internal(ds_buf, NULL, size, flags, + UCS_MEMORY_TYPE_RDMA, 3); + if (status == OSHMEM_SUCCESS) { + ds_buf->alloc_hints = hint; + ds_buf->allocator = &sshmem_ucx_allocator; + return OSHMEM_SUCCESS; } - - void *dev_mem_address; - uct_ib_device_mem_h dev_mem = alloc_device_mem(spml, size, - &dev_mem_address); - if (dev_mem != NULL) { - int ret; - ret = segment_create_internal(ds_buf, dev_mem_address, size, 0, - hint, dev_mem); - if (ret == OSHMEM_SUCCESS) { - return OSHMEM_SUCCESS; - } else if (dev_mem != NULL) { - uct_ib_md_release_device_mem(dev_mem); - /* fallback to regular allocation */ - } - } - } +#else + SSHMEM_VERBOSE(3, "DEVICE_NIC_MEM hint ignored since UCX does not " + "support MEMORY_TYPE_RDMA"); #endif + return OSHMEM_ERR_NOT_IMPLEMENTED; + } - flags = UCP_MEM_MAP_ALLOCATE | (spml->heap_reg_nb ? UCP_MEM_MAP_NONBLOCK : 0); - if (hint) { - return segment_create_internal(ds_buf, NULL, size, flags, hint, NULL); - } else { - return segment_create_internal(ds_buf, mca_sshmem_base_start_address, - size, flags | UCP_MEM_MAP_FIXED, hint, - NULL); + flags |= UCP_MEM_MAP_FIXED; + if (spml->heap_reg_nb) { + flags |= UCP_MEM_MAP_NONBLOCK; } + return segment_create_internal(ds_buf, mca_sshmem_base_start_address, size, + flags, UCS_MEMORY_TYPE_HOST, 0); } static void * @@ -303,12 +251,6 @@ segment_unlink(map_segment_t *ds_buf) ucp_mem_unmap(spml->ucp_context, ctx->ucp_memh); -#if HAVE_UCX_DEVICE_MEM - if (ctx->dev_mem) { - uct_ib_md_release_device_mem(ctx->dev_mem); - } -#endif - ds_buf->context = NULL; free(ctx);