From 0bb10c42bbafaaf35d9a0d3b9bad5724a00fc1c8 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 19 Dec 2022 16:51:55 +0100 Subject: [PATCH 01/15] coll/han: reorder free calls and avoid read-after-free in debug builds Inside the main loop, whenever we read a new string we free it first. The first iteration will be free(NULL), which is legal. At the end, we free all strings in all paths. This removes a potential read-after-free in a debug build and removes some calls to free from the error paths. Signed-off-by: Joseph Schuchart --- ompi/mca/coll/han/coll_han_dynamic_file.c | 38 ++++++++--------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/ompi/mca/coll/han/coll_han_dynamic_file.c b/ompi/mca/coll/han/coll_han_dynamic_file.c index c41cf6280fc..0500cb99a90 100644 --- a/ompi/mca/coll/han/coll_han_dynamic_file.c +++ b/ompi/mca/coll/han/coll_han_dynamic_file.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ /* - * Copyright (c) 2018-2020 The University of Tennessee and The University + * Copyright (c) 2018-2022 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2022 IBM Corporation. All rights reserved @@ -65,6 +65,7 @@ mca_coll_han_init_dynamic_rules(void) int algorithm_id; char * coll_name = NULL; char * algorithm_name = NULL; + char * target_comp_name = NULL; collective_rule_t *coll_rules; /* Topo information */ @@ -135,6 +136,7 @@ mca_coll_han_init_dynamic_rules(void) mca_coll_han_component.dynamic_rules.nb_collectives = i+1; /* Get the collective identifier */ + free(coll_name); if( getnext_string(fptr, &coll_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules invalid collective at line %d." 
@@ -155,9 +157,7 @@ mca_coll_han_init_dynamic_rules(void) coll_name, fileline, ALLGATHER, COLLCOUNT); goto file_reading_error; } - if( NULL != coll_name ) { - free(coll_name); - } + free(coll_name); coll_name = strdup(mca_coll_base_colltype_to_str(coll_id)); } @@ -321,7 +321,6 @@ mca_coll_han_init_dynamic_rules(void) /* Iterate on message size rules */ for( l = 0; l < nb_msg_size; l++ ) { - char* target_comp_name = NULL; conf_rules[k].nb_msg_size = l+1; /* Get the message size */ @@ -338,6 +337,7 @@ mca_coll_han_init_dynamic_rules(void) } /* Get the component identifier for this message size rule */ + free(target_comp_name); if( getnext_string(fptr, &target_comp_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " @@ -353,8 +353,6 @@ mca_coll_han_init_dynamic_rules(void) "reader encountered an unexpected EOF. Collective component id must be at " "least %d and less than %d\n", fname, fileline, target_comp_name, SELF, COMPONENTS_COUNT); - free(target_comp_name); - target_comp_name = NULL; goto file_reading_error; } @@ -362,13 +360,13 @@ mca_coll_han_init_dynamic_rules(void) algorithm_id = 0; // default for all collectives if ((component == HAN) && (1 == ompi_coll_base_file_peek_next_char_is(fptr, &fileline, '@')) ) { + free(algorithm_name); + algorithm_name = NULL; if( getnext_string(fptr, &algorithm_name) < 0 ) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " "at line %d: cannot read the name/id of an algorithm\n", fname, fileline); - free(target_comp_name); - target_comp_name = NULL; goto file_reading_error; } algorithm_id = mca_coll_han_algorithm_name_to_id(coll_id, algorithm_name); @@ -376,15 +374,11 @@ mca_coll_han_init_dynamic_rules(void) char *endp; algorithm_id = (int)strtol(algorithm_name, &endp, 10); char endc = *endp; - free(algorithm_name); - algorithm_name = 
NULL; if (('\0' != endc ) || !mca_coll_han_algorithm_id_is_valid(coll_id, algorithm_id)) { opal_output_verbose(5, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules found an error on dynamic rules file %s " "at line %d: unknown algorithm '%s' for %s\n", fname, fileline, algorithm_name, coll_name); - free(target_comp_name); - target_comp_name = NULL; goto file_reading_error; } } @@ -422,19 +416,13 @@ mca_coll_han_init_dynamic_rules(void) "file %s line %d found end of file while reading the optional list " "of segment lengths for collective %s component %s\n", fname, fileline, coll_name, target_comp_name); - free(target_comp_name); goto file_reading_error; } } } - free(target_comp_name); } } } - if( NULL != coll_name ) { - free(coll_name); - coll_name = NULL; - } } if( getnext_long(fptr, &nb_coll) > 0 ) { @@ -455,7 +443,9 @@ mca_coll_han_init_dynamic_rules(void) fclose(fptr); check_dynamic_rules(); + free(coll_name); free(algorithm_name); + free(target_comp_name); return OMPI_SUCCESS; cannot_allocate: @@ -465,10 +455,9 @@ mca_coll_han_init_dynamic_rules(void) opal_output_verbose(0, mca_coll_han_component.han_output, "coll:han:mca_coll_han_init_dynamic_rules " "cannot allocate dynamic rules\n"); - if( NULL != coll_name ) { - free(coll_name); - } + free(coll_name); free(algorithm_name); + free(target_comp_name); fclose (fptr); /* We disable the module, we don't need to keep the rules */ mca_coll_han_free_dynamic_rules(); @@ -481,10 +470,9 @@ mca_coll_han_init_dynamic_rules(void) "Will use mca parameters defined rules. 
" "To see error detail, please set " "collective verbosity level over 5\n"); - if( NULL != coll_name ) { - free(coll_name); - } + free(coll_name); free(algorithm_name); + free(target_comp_name); fclose (fptr); /* We disable the module, we don't need to keep the rules */ mca_coll_han_free_dynamic_rules(); From b192a785b2624cc19d7668bc0b8331046eb92a36 Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Thu, 17 Aug 2023 09:58:57 +0300 Subject: [PATCH 02/15] SHMEM/MCA/SSHMEM/UCX: Fixing DEVICE_NIC_MEM support to use RDMA memory type Signed-off-by: Roie Danino Added a fallback for rdma allocation failure - allocating host memory instead Signed-off-by: Roie Danino --- config/ompi_check_ucx.m4 | 3 +- oshmem/mca/sshmem/ucx/configure.m4 | 30 +----- oshmem/mca/sshmem/ucx/sshmem_ucx.h | 1 - oshmem/mca/sshmem/ucx/sshmem_ucx_module.c | 117 ++++++---------------- 4 files changed, 33 insertions(+), 118 deletions(-) diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index 75aeb93e26e..fbea98cd7b3 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -107,7 +107,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[ UCP_ATOMIC_FETCH_OP_FXOR, UCP_PARAM_FIELD_ESTIMATED_NUM_PPN, UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK, - UCP_OP_ATTR_FLAG_MULTI_SEND], + UCP_OP_ATTR_FLAG_MULTI_SEND, + UCS_MEMORY_TYPE_RDMA], [], [], [#include ]) AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS], diff --git a/oshmem/mca/sshmem/ucx/configure.m4 b/oshmem/mca/sshmem/ucx/configure.m4 index 4991c7557c0..7bb9038c5d0 100644 --- a/oshmem/mca/sshmem/ucx/configure.m4 +++ b/oshmem/mca/sshmem/ucx/configure.m4 @@ -28,34 +28,9 @@ AC_DEFUN([MCA_oshmem_sshmem_ucx_CONFIG],[ save_LIBS="$LIBS" save_CPPFLAGS="$CPPFLAGS" - alloc_dm_LDFLAGS=" -L$ompi_check_ucx_libdir/ucx" - alloc_dm_LIBS=" -luct_ib" CPPFLAGS+=" $sshmem_ucx_CPPFLAGS" - LDFLAGS+=" $sshmem_ucx_LDFLAGS $alloc_dm_LDFLAGS" - LIBS+=" $sshmem_ucx_LIBS $alloc_dm_LIBS" - - AC_LANG_PUSH([C]) - AC_LINK_IFELSE([AC_LANG_PROGRAM( - [[ - #include - #include - ]], - [[ - 
uct_md_h md = ucp_context_find_tl_md((ucp_context_h)NULL, ""); - (void)uct_ib_md_alloc_device_mem(md, NULL, NULL, 0, "", NULL); - uct_ib_md_release_device_mem(NULL); - ]])], - [ - AC_MSG_NOTICE([UCX device memory allocation is supported]) - AC_DEFINE([HAVE_UCX_DEVICE_MEM], [1], [Support for device memory allocation]) - sshmem_ucx_LIBS+=" $alloc_dm_LIBS" - sshmem_ucx_LDFLAGS+=" $alloc_dm_LDFLAGS" - ], - [ - AC_MSG_NOTICE([UCX device memory allocation is not supported]) - AC_DEFINE([HAVE_UCX_DEVICE_MEM], [0], [Support for device memory allocation]) - ]) - AC_LANG_POP([C]) + LDFLAGS+=" $sshmem_ucx_LDFLAGS" + LIBS+=" $sshmem_ucx_LIBS" CPPFLAGS="$save_CPPFLAGS" LDFLAGS="$save_LDFLAGS" @@ -66,4 +41,3 @@ AC_DEFUN([MCA_oshmem_sshmem_ucx_CONFIG],[ AC_SUBST([sshmem_ucx_LDFLAGS]) AC_SUBST([sshmem_ucx_LIBS]) ])dnl - diff --git a/oshmem/mca/sshmem/ucx/sshmem_ucx.h b/oshmem/mca/sshmem/ucx/sshmem_ucx.h index b6085374caa..90d41ac002c 100644 --- a/oshmem/mca/sshmem/ucx/sshmem_ucx.h +++ b/oshmem/mca/sshmem/ucx/sshmem_ucx.h @@ -35,7 +35,6 @@ OSHMEM_DECLSPEC extern mca_sshmem_ucx_component_t mca_sshmem_ucx_component; typedef struct mca_sshmem_ucx_segment_context { - void *dev_mem; sshmem_ucx_shadow_allocator_t *shadow_allocator; ucp_mem_h ucp_memh; } mca_sshmem_ucx_segment_context_t; diff --git a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c index fa38d0693a0..262bef5ffe6 100644 --- a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c +++ b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c @@ -26,13 +26,6 @@ #include "sshmem_ucx.h" -//#include - -#if HAVE_UCX_DEVICE_MEM -#include -#include -#endif - #define ALLOC_ELEM_SIZE sizeof(uint64_t) #define min(a,b) ((a) < (b) ? (a) : (b)) #define max(a,b) ((a) > (b) ? 
(a) : (b)) @@ -104,7 +97,7 @@ static segment_allocator_t sshmem_ucx_allocator = { static int segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, - unsigned flags, long hint, void *dev_mem) + unsigned flags, ucs_memory_type_t mem_type, int err_level) { mca_sshmem_ucx_segment_context_t *ctx; int rc = OSHMEM_SUCCESS; @@ -120,15 +113,19 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS | UCP_MEM_MAP_PARAM_FIELD_LENGTH | - UCP_MEM_MAP_PARAM_FIELD_FLAGS; + UCP_MEM_MAP_PARAM_FIELD_FLAGS | + UCP_MEM_MAP_PARAM_FIELD_MEMORY_TYPE; - mem_map_params.address = address; - mem_map_params.length = size; - mem_map_params.flags = flags; + mem_map_params.address = address; + mem_map_params.length = size; + mem_map_params.flags = flags; + mem_map_params.memory_type = mem_type; status = ucp_mem_map(spml->ucp_context, &mem_map_params, &mem_h); if (UCS_OK != status) { - SSHMEM_ERROR("ucp_mem_map() failed: %s\n", ucs_status_string(status)); + SSHMEM_VERBOSE(err_level, "ucp_mem_map(memory_type=%s) failed: %s\n", + ucs_memory_type_names[mem_type], + ucs_status_string(status)); rc = OSHMEM_ERROR; goto out; } @@ -161,12 +158,7 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size); ds_buf->context = ctx; ds_buf->type = MAP_SEGMENT_ALLOC_UCX; - ds_buf->alloc_hints = hint; ctx->ucp_memh = mem_h; - ctx->dev_mem = dev_mem; - if (hint) { - ds_buf->allocator = &sshmem_ucx_allocator; - } out: OPAL_OUTPUT_VERBOSE( @@ -181,82 +173,37 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, return rc; } -#if HAVE_UCX_DEVICE_MEM -static uct_ib_device_mem_h alloc_device_mem(mca_spml_ucx_t *spml, size_t size, - void **address_p) -{ - uct_ib_device_mem_h dev_mem = NULL; - ucs_status_t status; - uct_md_h uct_md; - void *address; - size_t length; - - uct_md = 
ucp_context_find_tl_md(spml->ucp_context, "mlx5"); - if (uct_md == NULL) { - SSHMEM_VERBOSE(1, "ucp_context_find_tl_md() returned NULL\n"); - return NULL; - } - - /* If found a matching memory domain, allocate device memory on it */ - length = size; - address = NULL; - status = uct_ib_md_alloc_device_mem(uct_md, &length, &address, - UCT_MD_MEM_ACCESS_ALL, "sshmem_seg", - &dev_mem); - if (status != UCS_OK) { - /* If could not allocate device memory - fallback to mmap (since some - * PEs in the job may succeed and while others failed */ - SSHMEM_VERBOSE(1, "uct_ib_md_alloc_dm() failed: %s\n", - ucs_status_string(status)); - return NULL; - } - - SSHMEM_VERBOSE(3, "uct_ib_md_alloc_dm() returned address %p\n", address); - *address_p = address; - return dev_mem; -} -#endif - static int segment_create(map_segment_t *ds_buf, const char *file_name, size_t size, long hint) { mca_spml_ucx_t *spml = (mca_spml_ucx_t*)mca_spml.self; - unsigned flags; + unsigned flags = UCP_MEM_MAP_ALLOCATE; + int status; -#if HAVE_UCX_DEVICE_MEM - int ret = OSHMEM_ERROR; if (hint & SHMEM_HINT_DEVICE_NIC_MEM) { - if (size > UINT_MAX) { - return OSHMEM_ERR_BAD_PARAM; +#if HAVE_DECL_UCS_MEMORY_TYPE_RDMA + status = segment_create_internal(ds_buf, NULL, size, flags, + UCS_MEMORY_TYPE_RDMA, 3); + if (status == OSHMEM_SUCCESS) { + ds_buf->alloc_hints = hint; + ds_buf->allocator = &sshmem_ucx_allocator; + return OSHMEM_SUCCESS; } - - void *dev_mem_address; - uct_ib_device_mem_h dev_mem = alloc_device_mem(spml, size, - &dev_mem_address); - if (dev_mem != NULL) { - int ret; - ret = segment_create_internal(ds_buf, dev_mem_address, size, 0, - hint, dev_mem); - if (ret == OSHMEM_SUCCESS) { - return OSHMEM_SUCCESS; - } else if (dev_mem != NULL) { - uct_ib_md_release_device_mem(dev_mem); - /* fallback to regular allocation */ - } - } - } +#else + SSHMEM_VERBOSE(3, "DEVICE_NIC_MEM hint ignored since UCX does not " + "support MEMORY_TYPE_RDMA"); #endif + return OSHMEM_ERR_NOT_IMPLEMENTED; + } - flags = 
UCP_MEM_MAP_ALLOCATE | (spml->heap_reg_nb ? UCP_MEM_MAP_NONBLOCK : 0); - if (hint) { - return segment_create_internal(ds_buf, NULL, size, flags, hint, NULL); - } else { - return segment_create_internal(ds_buf, mca_sshmem_base_start_address, - size, flags | UCP_MEM_MAP_FIXED, hint, - NULL); + flags |= UCP_MEM_MAP_FIXED; + if (spml->heap_reg_nb) { + flags |= UCP_MEM_MAP_NONBLOCK; } + return segment_create_internal(ds_buf, mca_sshmem_base_start_address, size, + flags, UCS_MEMORY_TYPE_HOST, 0); } static void * @@ -303,12 +250,6 @@ segment_unlink(map_segment_t *ds_buf) ucp_mem_unmap(spml->ucp_context, ctx->ucp_memh); -#if HAVE_UCX_DEVICE_MEM - if (ctx->dev_mem) { - uct_ib_md_release_device_mem(ctx->dev_mem); - } -#endif - ds_buf->context = NULL; free(ctx); From 7074e59ea93e8eb5edde53655bea35d658c286f9 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 7 Sep 2023 15:06:09 -0400 Subject: [PATCH 03/15] Replace use of writev by sendmsg This allows the TCP BTL to avoid raising SIGPIPE on OSes that do not support SO_NOSIGPIPE. Correctly use the unsigned type of the vpid when using it as a starting position for finding the process rank in a group. Signed-off-by: George Bosilca --- ompi/group/group.h | 12 +++++------- opal/mca/btl/tcp/btl_tcp_frag.c | 22 +++++++++++++++++----- opal/win32/opal_uio.c | 6 +++--- opal/win32/opal_uio.h | 14 +++++++------- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/ompi/group/group.h b/ompi/group/group.h index 58251892015..c188e98f02f 100644 --- a/ompi/group/group.h +++ b/ompi/group/group.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2020 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -459,19 +459,17 @@ static inline struct ompi_proc_t *ompi_group_peer_lookup_existing (ompi_group_t */ static inline int ompi_group_proc_lookup_rank (ompi_group_t* group, ompi_proc_t* proc) { - int i, np, v; + int i, np, rank; + opal_vpid_t v; assert( NULL != proc ); assert( !ompi_proc_is_sentinel(proc) ); np = ompi_group_size(group); if( 0 == np ) return MPI_PROC_NULL; /* heuristic: On comm_world, start the lookup from v=vpid, so that - * when working on comm_world, the search is O(1); - * Otherwise, wild guess: start from a proportional position - * compared to comm_world position. */ + * when working on comm_world, on average, the search remains O(1). */ v = proc->super.proc_name.vpid; - v = (viov_ptr; + msg.msg_iovlen = frag->iov_cnt; + msg.msg_control = NULL; + msg.msg_controllen = 0; + + /* non-blocking write, continue if interrupted */ do { - cnt = writev(sd, frag->iov_ptr, frag->iov_cnt); + /* Use sendmsg to avoid issues with SIGPIPE as described in + * https://blog.erratasec.com/2018/10/tcpip-sockets-and-sigpipe.html# + */ + cnt = sendmsg(sd, &msg, msg_flags); if (cnt < 0) { switch (opal_socket_errno) { case EINTR: @@ -116,7 +128,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) case EWOULDBLOCK: return false; case EFAULT: - BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n", + BTL_ERROR(("mca_btl_tcp_frag_send: sendmsg error (%p, %lu)\n\t%s(%lu)\n", frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len, strerror(opal_socket_errno), (unsigned long) frag->iov_cnt)); /* send_lock held by caller */ @@ -125,7 +137,7 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) return false; default: BTL_PEER_ERROR(frag->endpoint->endpoint_proc->proc_opal, - ("mca_btl_tcp_frag_send: writev failed: %s (%d)", + ("mca_btl_tcp_frag_send: sendmsg failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno)); /* send_lock held by caller 
*/ frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED; diff --git a/opal/win32/opal_uio.c b/opal/win32/opal_uio.c index 0270e0f4f7b..3c4bfe7550b 100644 --- a/opal/win32/opal_uio.c +++ b/opal/win32/opal_uio.c @@ -2,7 +2,7 @@ Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana University Research and Technology Corporation. All rights reserved. - Copyright (c) 2004-2005 The University of Tennessee and The University + Copyright (c) 2004-2023 The University of Tennessee and The University of Tennessee Research Foundation. All rights reserved. Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -26,12 +26,12 @@ of code to handle the windows error flags */ -int writev(int fd, struct iovec *iov, int cnt) +ssize_t sendmsg(int fd, const struct msghdr *message, int flags) { int err; DWORD sendlen; - err = WSASend((SOCKET) fd, &(iov->data), cnt, &sendlen, 0, NULL, NULL); + err = WSASendMsg((SOCKET) fd, message, flags, &sendlen, NULL, NULL); if (err < 0) { return err; diff --git a/opal/win32/opal_uio.h b/opal/win32/opal_uio.h index 2691b0bd3d4..642beda1128 100644 --- a/opal/win32/opal_uio.h +++ b/opal/win32/opal_uio.h @@ -2,7 +2,7 @@ * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2023 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, @@ -33,14 +33,14 @@ struct iovec { #define iov_len data.len BEGIN_C_DECLS + /* - * writev: - writev writes data to file descriptor fd, and from the buffers - described by iov. The number of buffers is specified by cnt. The - buffers are used in the order specified. Operates just like write - except that data is taken from iov instead of a contiguous buffer. 
+ * sendmsg: + * writes data to a file descriptor. This is a convenience function to allow + * the TCP BTL to support Windows. Overall it should behave similarly to the + * POSIX sendmsg function. */ -OPAL_DECLSPEC int writev(int fd, struct iovec *iov, int cnt); +OPAL_DECLSPEC ssize_t sendmsg(int socket, const struct msghdr *message, int flags); /* readv reads data from file descriptor fd, and puts the result in the From cef772b4fbf3c5a9b9f6492e92cfabb3277fd5d3 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 22 Sep 2023 15:24:27 -0400 Subject: [PATCH 04/15] Use aggregate initialization to ensure all fields are set Removes complaints from coverity about msg.msg_flags not being set. For more information about this read the discussion on #11915. Signed-off-by: George Bosilca --- opal/mca/btl/tcp/btl_tcp_frag.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/opal/mca/btl/tcp/btl_tcp_frag.c b/opal/mca/btl/tcp/btl_tcp_frag.c index b70ad9e34eb..36c01537895 100644 --- a/opal/mca/btl/tcp/btl_tcp_frag.c +++ b/opal/mca/btl/tcp/btl_tcp_frag.c @@ -105,16 +105,11 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t *frag, int sd) { ssize_t cnt; size_t i, num_vecs; - struct msghdr msg; + struct msghdr msg = { + .msg_iov = frag->iov_ptr, + .msg_iovlen = frag->iov_cnt }; int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_iov = frag->iov_ptr; - msg.msg_iovlen = frag->iov_cnt; - msg.msg_control = NULL; - msg.msg_controllen = 0; - /* non-blocking write, continue if interrupted */ do { /* Use sendmsg to avoid issues with SIGPIPE as described in From da9206a4c1df7744d4cc05358336bf1aaff596f4 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Tue, 11 Jul 2023 12:02:54 -0400 Subject: [PATCH 05/15] mailmap: Add alternate email address for Jeff Squyres Signed-off-by: Jeff Squyres --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index e8516075720..b463497a038 100644 --- 
a/.mailmap +++ b/.mailmap @@ -32,6 +32,7 @@ Jeff Squyres Jeff Squyres --quiet <--quiet> Jeff Squyres +Jeff Squyres George Bosilca From ab7013787261ac93d0ea55ea94af00e07cd215a2 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sun, 24 Sep 2023 10:21:33 -0400 Subject: [PATCH 06/15] Update prrte submodule to include new RST functionality Signed-off-by: Jeff Squyres --- 3rd-party/prrte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rd-party/prrte b/3rd-party/prrte index 0347baa1eda..9015ca02cce 160000 --- a/3rd-party/prrte +++ b/3rd-party/prrte @@ -1 +1 @@ -Subproject commit 0347baa1edaec29c4f0cf1eac7b674ad7ba139c1 +Subproject commit 9015ca02cce72acc03f86d399f939843c42b3dc8 From 1fd09447f4234f15d2d8c14dd4efba7601b67b8d Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Tue, 11 Jul 2023 12:02:04 -0400 Subject: [PATCH 07/15] docs: slurp PRTE's RST files into mpirun.1 This commit introduces a fundamentally new concept: have configure search PRRTE for RST files to include in Open MPI's documentation (regardless of whether we're using the internal/bundled PRRTE or an external PRRTE). If we're building against an external PRRTE that is old enough that it doesn't have any RST files installed, we'll make up some dummy RST files that basically say "you don't get help/content here because your PRRTE is too old." To simplify the configury for this scheme, this commit also makes another change: the pre-built HTML docs and nroff man pages included in distribution tarballs are now located at docs/html/ and docs/man/, respectively (vs. the location where we'll build them: docs/_build/html/ and docs/_build/man/, respectively). There are two cases here: 1. If the user has Sphinx available, we'll build the docs under docs/_build/, and install those (effectively ignoring the pre-built docs). 2. If the user does not have Sphinx available, we'll just install the pre-built docs. This simplifies things like "make clean" and "make distcheck". 
Including RST content from PRTE required another major change: when we build the RST docs in a VPATH scenario, we copy the entire docs/ source tree to the build tree. This allows us to modify the RST sources a bit (e.g., to include the PRRTE RST files or generate dummy PRRTE RST files). mpirun.1.rst is updated to include the RST content from PRRTE about CLI options. More work needs to be done here to remove old, now-redundant content. Finally, we also amend the advice to implementors to have Sphinx installed when building their package so that Open MPI's build system can properly slurp in their PRRTE's RST docs. Signed-off-by: Jeff Squyres --- .gitignore | 9 + Makefile.ompi-rules | 9 + config/ompi_setup_prrte.m4 | 59 +++- configure.ac | 3 +- docs/Makefile.am | 270 ++++++++++++--- docs/conf.py | 20 +- docs/index.rst | 8 +- docs/installing-open-mpi/packagers.rst | 42 +++ .../required-support-libraries.rst | 5 +- docs/man-openmpi/man1/mpirun.1.rst | 307 +++++++++++------- docs/news/news-v5.0.x.rst | 7 +- docs/no-prrte-content.rst.txt | 24 ++ 12 files changed, 590 insertions(+), 173 deletions(-) create mode 100644 docs/no-prrte-content.rst.txt diff --git a/.gitignore b/.gitignore index d15a1bc8f88..c1bfe01444a 100644 --- a/.gitignore +++ b/.gitignore @@ -534,3 +534,12 @@ docs/_templates # Common Python virtual environment directory names venv py?? + +# Copies of PRRTE RST files (i.e., not source controlled in this tree) +docs/prrte-rst-content +docs/schizo-ompi-rst-content + +# Copies of the built HTML docs and man pages (for distribution +# tarballs) +docs/html +docs/man diff --git a/Makefile.ompi-rules b/Makefile.ompi-rules index 567bcfd99f3..d18d49c4978 100644 --- a/Makefile.ompi-rules +++ b/Makefile.ompi-rules @@ -2,6 +2,7 @@ # Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2008 Sun Microsystems, Inc. All rights reserved. # Copyright (c) 2020 Intel, Inc. All rights reserved. +# Copyright (c) 2023 Jeffrey M. Squyres. 
All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -26,6 +27,14 @@ OMPI_V_GEN = $(ompi__v_GEN_$V) ompi__v_GEN_ = $(ompi__v_GEN_$AM_DEFAULT_VERBOSITY) ompi__v_GEN_0 = @echo " GENERATE" $@; +OMPI_V_COPYALL = $(ompi__v_COPYALL_$V) +ompi__v_COPYALL_ = $(ompi__v_COPYALL_$AM_DEFAULT_VERBOSITY) +ompi__v_COPYALL_0 = @echo " COPY tree $@"; + +OMPI_V_SPHINX_COPYRST = $(ompi__v_SPHINX_COPYRST_$V) +ompi__v_SPHINX_COPYRST_ = $(ompi__v_SPHINX_COPYRST_$AM_DEFAULT_VERBOSITY) +ompi__v_SPHINX_COPYRST_0 = @echo " COPY RST source files"; + OMPI_V_SPHINX_HTML = $(ompi__v_SPHINX_HTML_$V) ompi__v_SPHINX_HTML_ = $(ompi__v_SPHINX_HTML_$AM_DEFAULT_VERBOSITY) ompi__v_SPHINX_HTML_0 = @echo " GENERATE HTML docs"; diff --git a/config/ompi_setup_prrte.m4 b/config/ompi_setup_prrte.m4 index 4dffa6ceb2a..97eba7a1bd2 100644 --- a/config/ompi_setup_prrte.m4 +++ b/config/ompi_setup_prrte.m4 @@ -19,6 +19,7 @@ dnl Copyright (c) 2019-2020 Intel, Inc. All rights reserved. dnl Copyright (c) 2020-2022 Amazon.com, Inc. or its affiliates. All Rights reserved. dnl Copyright (c) 2021 Nanook Consulting. All rights reserved. dnl Copyright (c) 2021-2022 IBM Corporation. All rights reserved. +dnl Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -35,10 +36,25 @@ dnl dnl A Makefile conditional OMPI_WANT_PRRTE will be defined based on the dnl results of the build. AC_DEFUN([OMPI_SETUP_PRRTE],[ - OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy]) + AC_REQUIRE([AC_PROG_LN_S]) + +OPAL_VAR_SCOPE_PUSH([prrte_setup_internal_happy prrte_setup_external_happy target_rst_dir]) opal_show_subtitle "Configuring PRRTE" + # We *must* have setup Sphinx before invoking this macro (i.e., it + # is a programming error -- not a run-time error -- if Sphinx was + # not previously setup). 
+ OAC_ASSERT_BEFORE([OAC_SETUP_SPHINX], [OMPI_SETUP_PRRTE]) + + # These are sym links to folders with PRRTE's RST files that we'll + # slurp into mpirun.1.rst. We'll remove these links (or even + # accidental full copies) now and replace them with new links to + # the PRRTE that we find, below. + target_rst_dir="$OMPI_TOP_BUILDDIR/docs" + rm -rf "$target_rst_dir/prrte-rst-content" + rm -rf "$target_rst_dir/schizo-ompi-rst-content" + OPAL_3RDPARTY_WITH([prrte], [prrte], [package_prrte], [1]) AC_ARG_WITH([prrte-bindir], @@ -101,12 +117,15 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[ [$OMPI_USING_INTERNAL_PRRTE], [Whether or not we are using the internal PRRTE]) - OPAL_SUMMARY_ADD([Miscellaneous], [prrte], [], [$opal_prrte_mode]) + AC_SUBST(OMPI_PRRTE_RST_CONTENT_DIR) + AC_SUBST(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR) + AM_CONDITIONAL(OMPI_HAVE_PRRTE_RST, [test $OMPI_HAVE_PRRTE_RST -eq 1]) + + OPAL_SUMMARY_ADD([Miscellaneous], [PRRTE], [], [$opal_prrte_mode]) OPAL_VAR_SCOPE_POP ]) - dnl _OMPI_SETUP_PRRTE_INTERNAL([action-if-success], [action-if-not-success]) dnl dnl Attempt to configure the built-in PRRTE. @@ -220,7 +239,15 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_INTERNAL], [ [AC_MSG_ERROR([PRRTE configuration failed. 
Cannot continue.])]) AS_IF([test "$internal_prrte_happy" = "yes"], - [$1], [$2]) + [AC_MSG_CHECKING([for internal PRRTE RST files]) + AS_IF([test -n "$SPHINX_BUILD"], + [OMPI_HAVE_PRRTE_RST=1 + OMPI_PRRTE_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/docs/prrte-rst-content" + OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$OMPI_TOP_SRCDIR/3rd-party/prrte/src/mca/schizo/ompi" + AC_MSG_RESULT([found])], + [AC_MSG_RESULT([not found])]) + $1], + [$2]) OPAL_VAR_SCOPE_POP ]) @@ -284,9 +311,27 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [ [AC_DEFINE_UNQUOTED([OMPI_PRTERUN_PATH], ["${prterun_path}"], [Path to prterun])]) AS_IF([test "$setup_prrte_external_happy" = "yes"], - [$1], [$2]) + [ # Determine if this external PRRTE has installed the RST + # directories that we care about + + AC_MSG_CHECKING([for external PRRTE RST files]) + prrte_install_dir=${with_prrte}/share/prte/rst + AS_IF([test -n "$SPHINX_BUILD"], + [AS_IF([test -d "$prrte_install_dir/prrte-rst-content" && \ + test -d "$prrte_install_dir/schizo-ompi-rst-content"], + [OMPI_HAVE_PRRTE_RST=1 + OMPI_PRRTE_RST_CONTENT_DIR="$prrte_install_dir/prrte-rst-content" + OMPI_SCHIZO_OMPI_RST_CONTENT_DIR="$prrte_install_dir/schizo-ompi-rst-content" + AC_MSG_RESULT([found]) + ], + [ # This version of PRRTE doesn't have installed RST + # files. + AC_MSG_RESULT([not found]) + OMPI_HAVE_PRRTE_RST=0 + ]) + ]) + $1], + [$2]) OPAL_VAR_SCOPE_POP ]) - - diff --git a/configure.ac b/configure.ac index 7c3c3936c3b..f03bdaf268c 100644 --- a/configure.ac +++ b/configure.ac @@ -28,6 +28,7 @@ # Copyright (c) 2018 FUJITSU LIMITED. All rights reserved. # Copyright (c) 2019 Triad National Security, LLC. All rights # reserved. +# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. 
# $COPYRIGHT$ # # Additional copyrights may follow @@ -1072,7 +1073,7 @@ AS_IF([test -z "$LEX" || \ dnl Note that we have to double escape the URL below dnl so that the # it contains doesn't confuse the Autotools -OAC_SETUP_SPHINX([$srcdir/docs/_build/man/MPI_T.3], +OAC_SETUP_SPHINX([$srcdir/docs/man/MPI_T.3], [[https://docs.open-mpi.org/en/main/developers/prerequisites.html#sphinx-and-therefore-python]]) # diff --git a/docs/Makefile.am b/docs/Makefile.am index 3aa2b3b960f..dc9a085e99e 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -1,6 +1,7 @@ # # Copyright (c) 2022 Cisco Systems, Inc. All rights reserved. # +# Copyright (c) 2023 Jeffrey M. Squyres. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -26,7 +27,7 @@ .NOTPARALLEL: OUTDIR = _build -SPHINX_CONFIG = conf.py +SPHINX_CONFIG = $(srcdir)/conf.py SPHINX_OPTS ?= -W --keep-going -j auto # Note: it is significantly more convenient to list all the source @@ -58,6 +59,9 @@ RST_SOURCE_FILES = \ EXTRA_DIST = \ requirements.txt \ + no-prrte-content.rst.txt \ + html \ + man \ $(SPHINX_CONFIG) \ $(TEXT_SOURCE_FILES) \ $(IMAGE_SOURCE_FILES) \ @@ -784,27 +788,48 @@ OSHMEM_MAN3 = \ MAN_OUTDIR = $(OUTDIR)/man +# If we're building the docs, then we install from the just-built +# docs. Otherwise, we install from the pre-built docs (i.e., the docs +# included in the tarball). +# +# NOTE: If we're in a git clone with a) no pre-built docs and b) +# Sphinx is not found, then both OPAL_BUILD_DOCS and OPAL_INSTALL_DOCS +# will be false, and the value of MAN_INSTALL_FROM will not be used. 
+if OPAL_BUILD_DOCS +MAN_INSTALL_FROM = $(MAN_OUTDIR) +HTML_INSTALL_FROM = $(OUTDIR)/html +else +MAN_INSTALL_FROM = man +HTML_INSTALL_FROM = html +endif + +# For each of the man page macros below: +# +# *_RST: the .rst source files +# *_BUILT: the files in the _build/man directory +# *_INSTALL_FROM: the files in either the _build/man/ directory (if we +# are building the Sphinx docs) or the man/ directory (if we are not +# building the Sphinx docs, and are using the pre-built docs that +# are included in the tarball). OMPI_MAN1_RST = $(OMPI_MAN1:%.1=man-openmpi/man1/%.1.rst) OMPI_MAN1_BUILT = $(OMPI_MAN1:%.1=$(MAN_OUTDIR)/%.1) +OMPI_MAN1_INSTALL_FROM = $(OMPI_MAN1:%.1=$(MAN_INSTALL_FROM)/%.1) OMPI_MAN3_RST = $(OMPI_MAN3:%.3=man-openmpi/man3/%.3.rst) OMPI_MAN3_BUILT = $(OMPI_MAN3:%.3=$(MAN_OUTDIR)/%.3) +OMPI_MAN3_INSTALL_FROM = $(OMPI_MAN3:%.3=$(MAN_INSTALL_FROM)/%.3) OMPI_MAN7_RST = $(OMPI_MAN7:%.7=man-openmpi/man7/%.7.rst) OMPI_MAN7_BUILT = $(OMPI_MAN7:%.7=$(MAN_OUTDIR)/%.7) +OMPI_MAN7_INSTALL_FROM = $(OMPI_MAN7:%.7=$(MAN_INSTALL_FROM)/%.7) OSHMEM_MAN1_RST = $(OSHMEM_MAN1:%.1=man-oshmem/man1/%.1.rst) OSHMEM_MAN1_BUILT = $(OSHMEM_MAN1:%.1=$(MAN_OUTDIR)/%.1) +OSHMEM_MAN1_INSTALL_FROM = $(OSHMEM_MAN1:%.1=$(MAN_INSTALL_FROM)/%.1) OSHMEM_MAN3_RST = $(OSHMEM_MAN3:%.3=man-oshmem/man3/%.3.rst) OSHMEM_MAN3_BUILT = $(OSHMEM_MAN3:%.3=$(MAN_OUTDIR)/%.3) - -EXTRA_DIST += \ - $(OMPI_MAN1_BUILT) \ - $(OMPI_MAN3_BUILT) \ - $(OMPI_MAN7_BUILT) \ - $(OSHMEM_MAN1_BUILT) \ - $(OSHMEM_MAN3_BUILT) +OSHMEM_MAN3_INSTALL_FROM = $(OSHMEM_MAN3:%.3=$(MAN_INSTALL_FROM)/%.3) ########################################################################### @@ -845,49 +870,202 @@ EXTRA_DIST += \ $(OSHMEM_MAN1_CXX_REDIRECTS) \ $(OSHMEM_MAN1_FORTRAN_REDIRECTS) + +########################################################################### + +ALL_MAN_BUILT = \ + $(OMPI_MAN1_BUILT) $(OMPI_MAN3_BUILT) $(OMPI_MAN7_BUILT) \ + $(OSHMEM_MAN1_BUILT) $(OSHMEM_MAN_3_BUILT) + +# These 2 targets are used in 
EXTRA_DIST: we make a full copy of the +# built HTML and man docs into a separate location that is included in +# the tarball. This gives users a full copy of the docs included in +# distribution tarballs. +html: $(ALL_MAN_BUILT) + $(OMPI_V_COPYALL) rm -rf html; cp -rp $(OUTDIR)/html . + +man: $(ALL_MAN_BUILT) + $(OMPI_V_COPYALL) rm -rf man; cp -rp $(OUTDIR)/man . + +# Remove the copies of the built HTML and man pages to get back to a +# clean git clone. +maintainer-clean-local: + rm -rf html man + +# If we're doing a VPATH build, we may have "html" and "man" +# directories in the build tree (e.g., if we did "make dist"). Remove +# these copies so that we can pass distcheck (of course: we never +# remove these directories from the source tree). +distclean-local: + if test "$(srcdir)" != "$(builddir)"; then \ + rm -rf html man; \ + fi + ########################################################################### if OPAL_BUILD_DOCS include $(top_srcdir)/Makefile.ompi-rules -# Have to not list these targets in EXTRA_DIST outside of the -# OPAL_BUILD_DOCS conditional because "make dist" will fail due to -# these missing targets (and therefore not run the "dist-hook" target -# in the top-level Makefile, which prints a pretty message about why -# "make dist" failed). +# Copy over the PRRTE RST files to this build tree. # -# We list the entire directory trees (html and man) to grab all -# generated files in them. -EXTRA_DIST += \ - $(OUTDIR)/html \ - $(OUTDIR)/man +# 1. If we're building with PRRTE support: +# +# 1a. If we're building the internal/bundled PRRTE, then we'll copy +# the internal/bundled PRRTE's RST files to the build tree. +# 1b. If we're building against an external PRRTE installation that +# has RST files in its install tree, then we'll copy that +# external PRRTE's RST files to the build tree. +# 1c.
If we're building against an external PRRTE installation that +# does NOT have RST files in its install tree, then we'll +# create some dummy RST files instead. +# +# 2. If we're building without PRRTE support, we'll create some dummy +# RST files instead. +# +# NOTE: We specifically list $(builddir) in the target name, just to +# ensure that "make" doesn't accidentally find this directory in the +# VPATH srcdir, and therefore not execute this rule (because Sphinx +# does not understand VPATH, and will ignore this directory in the +# VPATH srcdir). We can have this directory in the srcdir by doing a +# VPATH build of an official distribution tarball. -ALL_MAN_BUILT = \ - $(OMPI_MAN1_BUILT) $(OMPI_MAN3_BUILT) $(OMPI_MAN7_BUILT) \ - $(OSHMEM_MAN1_BUILT) $(OSHMEM_MAN_3_BUILT) +# Make the 2 directories that we need: schizo-ompi-rst-content and +# prrte-rst-content. +$(builddir)/schizo-ompi-rst-content: + $(OMPI_V_MKDIR) if test ! -d "$@"; then mkdir "$@"; fi +$(builddir)/prrte-rst-content: + $(OMPI_V_MKDIR) if test ! -d "$@"; then mkdir "$@"; fi + +# Get the schizo-ompi-rst-cli.rst file that we need. CAVEAT: we name +# it ".in" so that Sphinx doesn't slurp it in via two different +# locations in the RST docroot (i.e., via +# /schizo-ompi-rst-content/schizo-ompi-cli.rstxt and via +# /man-openmpi/man1/mpirun.1.rst). Sphinx *shouldn't* do this -- it +# should see the ".. include...." directive in mpirun.1.rst and *only* +# include the file once. But somehow it's also seeing it a 2nd time. +# So -- fine. We'll name it something other than .rst so that Sphinx +# doesn't do that. +# +# Regardless, either copy this file from the PRRTE install tree or +# make a bogus one (if we don't have one in the PRRTE install tree). +# +# Also, note: the rule to make the $(builddir)/schizo-ompi-rst-content +# directory must be in the AM_CONDITIONAL here, otherwise Automake +# complains. Meaning: we have to have same dependency listed in both +# the "if" and the "else" blocks. Grumble. 
+if OMPI_HAVE_PRRTE_RST +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(builddir)/schizo-ompi-rst-content +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR)/* + $(OMPI_V_SPHINX_COPYRST) \ + dir=`dirname $@`; \ + cp -rpf $(OMPI_SCHIZO_OMPI_RST_CONTENT_DIR)/* "$$dir" +else +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(builddir)/schizo-ompi-rst-content +$(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(srcdir)/no-prrte-content.rst.txt + if test ! -d "$$dir"; then mkdir "$$dir"; fi + $(OMPI_V_SPHINX_COPYRST) \ + dir=`dirname $@`; \ + cp -pf $(srcdir)/no-prrte-content.rst.txt "$$dir" +endif + +$(ALL_MAN_BUILT): $(builddir)/prrte-rst-content +$(ALL_MAN_BUILT): $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt $(ALL_MAN_BUILT): $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(ALL_MAN_BUILT): $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG) +# Render the RST source into both 1) full HTML docs and 2) nroff man +# pages. +# # List both commands (HTML and man) in a single rule because they # really need to be run in serial. Specifically, if they were two # different rules and someone ran "make -j", then both of them could # be writing to $(OUTDIR)/doctrees simultaneously, which would be Bad. # Use one of the man pages as a sentinel file to indicate whether all # the HTML docs and man pages have been built. +# +# It's therefore a little bit of a lie to have the target named +# $(ALL_MAN_BUILT) *also* generate all the HTML content, but... so be +# it. +# +# Also note that Open MPI's RST includes some conditional RST (from +# PRRTE -- i.e., whether we get the source RST from the internal +# PRRTE, an external PRRTE, or whether we create RST files from +# scratch). These conditionals mean that we have to make some changes +# to the input Sphinx RST tree before building it. But -- by Automake +# convention -- we can't modify the source tree. 
Hence, we have to +# copy over all the source RST files -- including its internal +# directory structure -- to the build tree, and then make our desired +# changes here in the build tree. This is a bit ugly, but we could +# not think of anything better to do. +# +# NOTE: This is a little gross in that for a VPATH build, we *always* +# copy from the source tree to the dest tree (if the target does not +# exist or any of the sources in the source tree -- thanks to +# make/VPATH handling -- have changed compared to the target). +# However, we're using "cp -p", so even though we're copying *all the +# sources* from the source tree to the build tree, the timestamp will +# reflect what is in the source tree. Hence, if the source file has +# not changed, then it won't look like the file in the build tree has +# changed. We're going to overwrite any local changes in the build +# tree, but you shouldn't be editing the build tree, anyway. So -- +# good enough. +# +# Finally, one added wrinkle: only copy the RST source files in +# prrte-rst-content that are referenced by ".. include::" in the +# schizo-ompi-cli.rstxt file. We do this because Sphinx complains if +# there are .rst files that are not referenced. :-( $(ALL_MAN_BUILT): - $(OMPI_V_SPHINX_HTML) $(SPHINX_BUILD) -M html "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) - $(OMPI_V_SPHINX_MAN) $(SPHINX_BUILD) -M man "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_COPYRST) if test "$(srcdir)" != "$(builddir)"; then \ + len=`echo "$(srcdir)/" | wc -c`; \ + for file in $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG); do \ + dir=`dirname $$file | cut -c$$len-`; \ + if test -z "$$dir"; then \ + dir=.; \ + fi; \ + if test ! -d "$$dir"; then \ + mkdir -p "$$dir"; \ + fi; \ + cp -p "$$file" "$$dir"; \ + done; \ + fi; \ + for file in `fgrep '.. 
include::' $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt | awk '{ print $$3 }'`; do \ + filename=`basename $$file`; \ + cp -pf $(OMPI_PRRTE_RST_CONTENT_DIR)/$$filename "$(builddir)/prrte-rst-content"; \ + done + $(OMPI_V_SPHINX_HTML) OMPI_VERSION_FILE=$(top_srcdir)/VERSION $(SPHINX_BUILD) -M html "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(OMPI_V_SPHINX_MAN) OMPI_VERSION_FILE=$(top_srcdir)/VERSION $(SPHINX_BUILD) -M man "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) # A useful rule to invoke manually to ensure that all of the external # HTML links we have are valid. Running this rule requires # connectivity to the general internet. linkcheck: - $(SPHINX_BUILD) -M linkcheck "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) + $(SPHINX_BUILD) -M linkcheck "$(builddir)" "$(OUTDIR)" $(SPHINX_OPTS) .PHONY: linkcheck -maintainer-clean-local: - $(SPHINX_BUILD) -M clean "$(srcdir)" "$(OUTDIR)" $(SPHINX_OPTS) +# Since we are building the docs, we built $(OUTDIR). Hence, we need +# to delete it during "make clean". Note that we can't add +# directories to CLEANFILES, because Automake only (effectively) does +# "rm -f $(CLEANFILES)" (not "rm -rf ..."). So we have to delete +# directories ourselves. +# +# Also, if this is a VPATH build, then we made a copy of a bunch of +# RST source files to the build tree. So delete all of those, too. +clean-local: + rm -rf $(OUTDIR) + rm -rf prrte-rst-content schizo-ompi-rst-content + if test "$(srcdir)" != "$(builddir)"; then \ + len=`echo "$(srcdir)/" | wc -c`; \ + for file in $(RST_SOURCE_FILES) $(IMAGE_SOURCE_FILES) $(TEXT_SOURCE_FILES) $(SPHINX_CONFIG); do \ + dir=`dirname $$file | cut -c$$len-`; \ + if test -z "$$dir"; then \ + rm -rf `basename $$file`; \ + fi; \ + if test -n "$$dir" && test -d "$$dir"; then \ + rm -rf "$$dir"; \ + fi; \ + done; \ + fi # List all the built man pages here in the Automake BUILT_SOURCES # macro. 
This hooks into the normal Automake build mechanisms, and @@ -901,7 +1079,7 @@ endif OPAL_BUILD_DOCS if OPAL_INSTALL_DOCS man1_MANS = \ - $(OMPI_MAN1_BUILT) \ + $(OMPI_MAN1_INSTALL_FROM) \ $(OMPI_MAN1_C_REDIRECTS) if OMPI_HAVE_CXX_COMPILER man1_MANS += $(OMPI_MAN1_CXX_REDIRECTS) @@ -913,12 +1091,12 @@ if OMPI_WANT_JAVA_BINDINGS man1_MANS += $(OMPI_MAN1_JAVA_REDIRECTS) endif -man3_MANS = $(OMPI_MAN3_BUILT) -man7_MANS = $(OMPI_MAN7_BUILT) +man3_MANS = $(OMPI_MAN3_INSTALL_FROM) +man7_MANS = $(OMPI_MAN7_INSTALL_FROM) if PROJECT_OSHMEM man1_MANS += \ - $(OSHMEM_MAN1_BUILT) \ + $(OSHMEM_MAN1_INSTALL_FROM) \ $(OSHMEM_MAN1_C_REDIRECTS) # There is no OSHMEM equivalent of this conditional; just use the OMPI # conditional. @@ -929,7 +1107,7 @@ if OSHMEM_BUILD_FORTRAN_BINDINGS man1_MANS += $(OSHMEM_MAN1_FORTRAN_REDIRECTS) endif -man3_MANS += $(OSHMEM_MAN3_BUILT) +man3_MANS += $(OSHMEM_MAN3_INSTALL_FROM) endif # We do not know the names of all the generated HTML files: we only @@ -945,19 +1123,29 @@ endif # Automake-provided install macros to set desirable permissions on the # target directories and files. # -# Since this might be a VPATH build, first check to see if _build/html -# exists in the source tree. If not, do the find+install from the -# build tree. +# Check to see if we actually built the docs. If we did, copy from +# the _build/html tree in the builddir. In all other cases, see if +# there's a _build/html in the source tree (e.g., if this is a build +# from a tarball that included a _build/html); if that exists, copy +# from that. +# +# NOTE: We can't use the AM_CONDITIONAL OPAL_BUILD_DOCS in the middle +# of a block that uses the shell continuation character at the end of +# each line. Instead, we check if $(SPHINX_BUILD) is non-empty, which +# is the test used to construct OPAL_BUILD_DOCS. 
install-data-hook: $(MKDIR_P) $(DESTDIR)$(docdir) - if test -d $(srcdir)/_build/html; then \ - topdir=$(srcdir)/_build; \ - else \ - topdir=_build; \ + topdir= ; \ + if test -n "$(SPHINX_BUILD)" && test -d $(builddir)/$(HTML_INSTALL_FROM); then \ + topdir="$(builddir)/$(HTML_INSTALL_FROM)"; \ + elif test -d $(srcdir)/$(HTML_INSTALL_FROM); then \ + topdir="$(srcdir)/$(HTML_INSTALL_FROM)"; \ fi; \ - cd $$topdir; \ - find html -type d -exec $(mkinstalldirs) $(DESTDIR)$(docdir)/{} \; ; \ - find html -type f -exec $(INSTALL_DATA) {} $(DESTDIR)$(docdir)/{} \; + if test -n "$$topdir"; then \ + cd $$topdir/..; \ + find html -type d -exec $(mkinstalldirs) $(DESTDIR)$(docdir)/{} \; ; \ + find html -type f -exec $(INSTALL_DATA) {} $(DESTDIR)$(docdir)/{} \; ; \ + fi uninstall-hook: rm -rf $(DESTDIR)$(docdir) diff --git a/docs/conf.py b/docs/conf.py index bf192f5356b..b8b7e8c4690 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -10,9 +10,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) +import os # -- Project information ----------------------------------------------------- @@ -24,8 +22,20 @@ author = 'The Open MPI Community' # The full version, including alpha/beta/rc tags -# Read the Open MPI version from the VERSION file -with open("../VERSION") as fp: +# Read the Open MPI version from the VERSION file in the source tree +# The docs/Makefile.am will set the env var OMPI_VERSION_FILE, because +# we might be doing a VPATH build. 
+filename = None +if 'OMPI_VERSION_FILE' in os.environ: + filename = os.environ['OMPI_VERSION_FILE'] +elif os.path.exists("../VERSION"): + filename = '../VERSION' + +if filename is None: + print("ERROR: Could not find Open MPI source tree VERSION file") + exit(1) + +with open(filename) as fp: ompi_lines = fp.readlines() ompi_data = dict() diff --git a/docs/index.rst b/docs/index.rst index a1f7d0b6d2f..c339c213622 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -28,9 +28,13 @@ Documentation for Open MPI can be found in the following locations: * - v5.0.0 and later - Web: https://docs.open-mpi.org/ - Tarball: ``docs/_build/html/index.html`` + Included in tarball: ``docs/html/index.html`` - Installed: ``$prefix/share/doc/openmpi/html/index.html`` + Built in source tree (if Sphinx available): ``docs/_build/html/index.html`` + + Installed: ``$docdir/html/index.html`` + + (which defaults to: ``$prefix/share/doc/openmpi/html/index.html``) * - v4.1.x and earlier - See the `legacy Open MPI FAQ `_ diff --git a/docs/installing-open-mpi/packagers.rst b/docs/installing-open-mpi/packagers.rst index 6435abded08..e43d52b101a 100644 --- a/docs/installing-open-mpi/packagers.rst +++ b/docs/installing-open-mpi/packagers.rst @@ -1,3 +1,5 @@ +.. _label-install-packagers: + Advice for packagers ==================== @@ -20,9 +22,26 @@ the following: .. code-block:: sh + # Install Sphinx so that Open MPI can re-build its docs with the + # installed PRRTE's docs + + virtualenv venv + . ./venv/bin/activate + pip install -r docs/requirements.txt + ./configure --with-libevent=external --with-hwloc=external \ --with-pmix=external --with-prrte=external ... +.. important:: Note the installation of the Sphinx tool so that Open + MPI can re-build its documentation with the external + PRRTE's documentation.
+ + Failure to do this will mean Open MPI's documentation + will be correct for the version of PRRTE that is + bundled in the Open MPI distribution, but may not be + entirely correct for the version of PRRTE that you are + building against. + The ``external`` keywords will force Open MPI's ``configure`` to ignore all the bundled libraries and only look for external versions of these support libraries. This also has the benefit of causing @@ -36,6 +55,29 @@ independently-built and installed versions. information about the required support library ``--with-FOO`` command line options. +Have Sphinx installed +--------------------- + +Since you should be (will be) installing Open MPI against an external +PRRTE and PMIx, you should have `Sphinx +`_ installed before running Open MPI's +``configure`` script. + +This will allow Open MPI to (re-)build its documentation according to +the PMIx and PRRTE that you are building against. + +To be clear: the Open MPI distribution tarball comes with pre-built +documentation |mdash| rendered in HTML and nroff |mdash| that is +suitable for the versions of PRRTE and PMIx that are bundled in that +tarball. + +However, if you are building Open MPI against not-bundled versions of +PRRTE / PMIx (as all packagers should be), Open MPI needs to re-build +its documentation with specific information from those external PRRTE +/ PMIx installs. For that, you need to have Sphinx installed before +running Open MPI's ``configure`` script. + + .. _label-install-packagers-dso-or-not: Components ("plugins"): DSO or no? diff --git a/docs/installing-open-mpi/required-support-libraries.rst b/docs/installing-open-mpi/required-support-libraries.rst index 9e02297998b..b411e1a02f5 100644 --- a/docs/installing-open-mpi/required-support-libraries.rst +++ b/docs/installing-open-mpi/required-support-libraries.rst @@ -399,6 +399,5 @@ Open MPI package should not include Hwloc, Libevent, PMIx, or PRRTE. 
Instead, it should depend on external, independently-built versions of these packages. -See the :ref:`Advice for packagers -` section for more -details. +See the :ref:`Advice for packagers ` section +for more details. diff --git a/docs/man-openmpi/man1/mpirun.1.rst b/docs/man-openmpi/man1/mpirun.1.rst index 66a0e75c269..c9168b60076 100644 --- a/docs/man-openmpi/man1/mpirun.1.rst +++ b/docs/man-openmpi/man1/mpirun.1.rst @@ -60,15 +60,17 @@ probably want to use a command line of the following form: This will run ``X`` copies of ```` in your current run-time environment (if running under a supported resource manager, Open MPI's -mpirun will usually automatically use the corresponding resource -manager process starter, as opposed to, for example, ``rsh`` or ``ssh``, which -require the use of a hostfile, or will default to running all ``X`` copies -on the localhost), scheduling (by default) in a round-robin fashion by -CPU slot. See the rest of this page for more details. - -Please note that mpirun automatically binds processes as of the start -of the v1.8 series. Three binding patterns are used in the absence of -any further directives (See :ref:`map/rank/bind defaults ` for more details): +``mpirun`` will usually automatically use the corresponding resource +manager process starter, as opposed to ``ssh`` (for example), which +require the use of a hostfile, or will default to running all ``X`` +copies on the localhost), scheduling (by default) in a round-robin +fashion by CPU slot. See the rest of this documentation for more +details. + +Please note that ``mpirun`` automatically binds processes to hardware +resources. 
Three binding patterns are used in the absence of any +further directives (See :ref:`map/rank/bind defaults +` for more details): * **Bind to core**: when the number of processes is <= 2 * **Bind to package**: when the number of processes is > 2 @@ -79,103 +81,43 @@ that you are either not bound at all (by specifying ``--bind-to none``), or bound to multiple cores using an appropriate binding level or specific number of processing elements per application process. -.. _man1-mpirun-definition-of-slot: - -DEFINITION OF 'SLOT' --------------------- - -The term "slot" is used extensively in the rest of this manual page. -A slot is an allocation unit for a process. The number of slots on a -node indicate how many processes can potentially execute on that node. -By default, Open MPI will allow one process per slot. - -If Open MPI is not explicitly told how many slots are available on a -node (e.g., if a hostfile is used and the number of slots is not -specified for a given node), it will determine a maximum number of -slots for that node in one of two ways: - -#. Default behavior: By default, Open MPI will attempt to discover the - number of processor cores on the node, and use that as the number - of slots available. - -#. When ``--use-hwthread-cpus`` is used: If ``--use-hwthread-cpus`` is - specified on the ``mpirun`` command line, then Open MPI will attempt to - discover the number of hardware threads on the node, and use that - as the number of slots available. - -This default behavior also occurs when specifying the ``--host`` -option with a single host. Thus, the command: - -.. code:: sh - - shell$ mpirun --host node1 ./a.out - -launches a number of processes equal to the number of cores on node -``node1``, whereas: - -.. code:: sh - - shell$ mpirun --host node1 --use-hwthread-cpus ./a.out - -launches a number of processes equal to the number of hardware -threads on ``node1``. 
- -When Open MPI applications are invoked in an environment managed by a -resource manager (e.g., inside of a Slurm job), and Open MPI was built -with appropriate support for that resource manager, then Open MPI will -be informed of the number of slots for each node by the resource -manager. For example: - -.. code:: sh - - shell$ mpirun ./a.out - -launches one process for every slot (on every node) as dictated by -the resource manager job specification. - -Also note that the one-process-per-slot restriction can be overridden -in unmanaged environments (e.g., when using hostfiles without a -resource manager) if oversubscription is enabled (by default, it is -disabled). Most MPI applications and HPC environments do not -oversubscribe; for simplicity, the majority of this documentation -assumes that oversubscription is not enabled. - -Slots are not hardware resources -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +OPEN MPI'S USE OF PRRTE +----------------------- -Slots are frequently incorrectly conflated with hardware resources. -It is important to realize that slots are an entirely different metric -than the number (and type) of hardware resources available. +Open MPI uses the PMIx Reference Runtime Environment (PRRTE) as the +main engine for launching, monitoring, and terminating MPI processes. -Here are some examples that may help illustrate the difference: +Much of the documentation below is directly imported from PRRTE. As +such, it frequently refers to PRRTE concepts and command line options. +Except where noted, these concepts and command line arguments are all +applicable to Open MPI as well. Open MPI extends the available PRRTE +command line options, and also slightly modifies the PRRTE's default +behaviors in a few cases. These will be specifically described in the +documentation below. -#. More processor cores than slots: Consider a resource manager job - environment that tells Open MPI that there is a single node with 20 - processor cores and 2 slots available.
By default, Open MPI will - only let you run up to 2 processes. - - Meaning: you run out of slots long before you run out of processor - cores. +COMMAND LINE OPTIONS +-------------------- -#. More slots than processor cores: Consider a hostfile with a single - node listed with a ``slots=50`` qualification. The node has 20 - processor cores. By default, Open MPI will let you run up to 50 - processes. +The core of Open MPI's ``mpirun`` processing is performed via the +`PRRTE `_. Specifically: ``mpirun`` is +effectively a wrapper around ``prterun``, but ``mpirun``'s CLI options +are slightly different than PRRTE's CLI commands. - Meaning: you can run many more processes than you have processor - cores. +.. include:: /schizo-ompi-rst-content/schizo-ompi-cli.rstxt -.. _man1-mpirun-definition-of-processor-element: +OPTIONS (OLD / HARD-CODED CONTENT -- TO BE AUDITED) +--------------------------------------------------- -DEFINITION OF 'PROCESSOR ELEMENT' ---------------------------------- +.. admonition:: This is old content + :class: error -By default, Open MPI defines that a "processing element" is a -processor core. However, if ``--use-hwthread-cpus`` is specified on the -mpirun command line, then a "processing element" is a hardware thread. + This is the old section of manually hard-coded content. It should + probably be read / audited and see what we want to keep and what we + want to discard. -OPTIONS -------- + Feel free to refer to https://docs.prrte.org/ rather than + replicating content here (e.g., for the definition of a slot and + other things). mpirun will send the name of the directory where it was invoked on the local node to each of the remote nodes, and attempt to change to that @@ -251,10 +193,11 @@ processes will be bound to the package. context.
If no value is provided for the number of copies to execute (i.e., neither the ``-n`` nor its synonyms are provided on the command line), Open MPI will automatically execute a copy of the - program on each process slot (see :ref:`defintion of slot ` for description of a - "process slot"). This feature, however, can only be used in the SPMD - model and will return an error (without beginning execution of the - application) otherwise. + program on each process slot (see PRRTE's `definition of "slot" + `_ + for description of a "process slot"). This feature, however, can + only be used in the SPMD model and will return an error (without + beginning execution of the application) otherwise. .. note:: The ``-n`` option is the preferred option to be used to specify the number of copies of the program to be executed, but the alternate @@ -280,7 +223,7 @@ To map processes: * ``--map-by ``: Map to the specified object, defaults to ``package``. Supported options include ``slot``, ``hwthread``, ``core``, ``L1cache``, ``L2cache``, ``L3cache``, ``package``, ``numa``, - ``node``, ``seq``, ``rankfile``, ``pe-list=#``, and ``ppr``. + ``node``, ``seq``, ``rankfile``, ``pe-list=#``, and ``ppr``. Any object can include modifiers by adding a ``:`` and any combination of the following: @@ -561,13 +504,17 @@ There are also other options: Note that if a number of slots is not provided to Open MPI (e.g., via the ``slots`` keyword in a hostfile or from a resource manager such as Slurm), the use of this option changes the default - calculation of number of slots on a node. See the :ref:`DEFINITION - OF 'SLOT' ` section. + calculation of number of slots on a node. See PRRTE's + `definition of "slot" + `_ + for more details. Also note that the use of this option changes the Open MPI's definition of a "processor element" from a processor core to a - hardware thread. See the :ref:`DEFINITION OF 'PROCESSOR ELEMENT' - ` section. + hardware thread.
See + PRRTE's `definition of a "processor element" + `_ + for more details. The following options are useful for developers; they are not generally useful to most Open MPI users: @@ -601,11 +548,23 @@ There may be other options listed with ``mpirun --help``. Environment Variables ^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + * ``MPIEXEC_TIMEOUT``: Synonym for the ``--timeout`` command line option. DESCRIPTION ----------- +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + One invocation of ``mpirun`` starts an MPI application running under Open MPI. If the application is single process multiple data (SPMD), the application can be specified on the ``mpirun`` command line. @@ -630,6 +589,12 @@ while others are specific to a single program (e.g., ``-n``). Specifying Host Nodes ^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Host nodes can be identified on the ``mpirun`` command line with the ``--host`` option or in a hostfile. @@ -679,6 +644,12 @@ from the resource manager. Specifying Number of Processes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + As we have just seen, the number of processes to run can be set using the hostfile. Other mechanisms exist. @@ -733,6 +704,12 @@ the ``-n`` option indicated that only 6 processes should be launched. Mapping Processes to Nodes: Using Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate?
Should it be updated and + retained, or removed? + The examples above illustrate the default mapping of process processes to nodes. This mapping can also be controlled with various ``mpirun`` options that describe mapping policies. @@ -845,6 +822,12 @@ and 2 each running uptime on nodes ``bb`` and ``cc``, respectively. Mapping, Ranking, and Binding: Oh My! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI employs a three-phase procedure for assigning process locations and ranks: @@ -934,7 +917,7 @@ Alternatively, processes can be mapped and bound to specified cores using the ``--map-by pe-list=`` option. For example, ``--map-by pe-list=0,2,5`` will map three processes all three of which will be bound to logical cores ``0,2,5``. If you intend to bind each of the three processes to different -cores then the ``:ordered`` qualifier can be used like +cores then the ``:ordered`` qualifier can be used like ``--map-by pe-list=0,2,5:ordered``. In this example, the first process on a node will be bound to CPU 0, the second process on the node will be bound to CPU 2, and the third process on the node will be bound to @@ -992,7 +975,7 @@ in ranking when the ``span`` qualifier is used instead of the default. In the above case, the output shows us that 2 cores have been bound per process. Specifically, the mapping by ``slot`` with the ``PE=2`` qualifier indicated that each slot (i.e., process) should consume two processor -elements. By default, Open MPI defines "processor element" as "core", +elements. By default, Open MPI defines "processor element" as "core", and therefore the ``--bind-to core`` caused each process to be bound to both of the cores to which it was mapped. 
@@ -1030,16 +1013,16 @@ MCA parameters can be set not only on the mpirun command line, but alternatively in a system or user ``mca-params.conf`` file or as environment variables, as described in the :ref:`Setting MCA Parameters `. These are MCA parameters for -the PRRTE runtime so the command line argument ``--PRRTEmca`` must be used to +the PRRTE runtime so the command line argument ``--PRRTEmca`` must be used to pass the MCA parameter key/value pair. Alternatively, the MCA parameter key/ -value pair may be specific on the command line by prefixing the key with +value pair may be specified on the command line by prefixing the key with ``PRRTE_MCA_``. Some examples include: .. list-table:: :header-rows: 1 * - Option - - PRRTE MCA parameter key + - PRRTE MCA parameter key - Value * - ``--map-by core`` @@ -1071,6 +1054,12 @@ value pair may be specific on the command line by prefixing the key with Defaults for Mapping, Ranking, and Binding ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + If the user does not specify each of ``--map-by``, ``--rank-by``, and ``--bind-to`` option then the default values are as follows: * If no options are specified then @@ -1167,6 +1156,12 @@ The mapping pattern might be better seen if we change the default ``--rank-by`` Rankfiles ^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Rankfiles are text files that specify detailed information about how individual processes should be mapped to nodes, and to which processor(s) they should be bound. Each line of a rankfile specifies @@ -1226,6 +1221,12 @@ indexes of package and cores. Application Context or Executable Program? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +..
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + To distinguish the two different forms, mpirun looks on the command line for ``--app`` option. If it is specified, then the file named on the command line is assumed to be an application context. If it is @@ -1234,6 +1235,12 @@ not specified, then the file is assumed to be an executable program. Locating Files ^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + If no relative or absolute path is specified for a file, Open MPI will first look for files by searching the directories specified by the ``--path`` option. If there is no ``--path`` option set or if the @@ -1252,6 +1259,12 @@ current working directory from the invocation of ``mpirun``. Current Working Directory ^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--wdir`` ``mpirun`` option (and its synonym, ``--wd``) allows the user to change to an arbitrary directory before the program is invoked. It can also be used in application context files to specify @@ -1279,6 +1292,12 @@ does not wait until :ref:`MPI_INIT(3) ` is called. Standard I/O ^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI directs UNIX standard input to ``/dev/null`` on all processes except the MPI_COMM_WORLD rank 0 process. The MPI_COMM_WORLD rank 0 process inherits standard input from ``mpirun``. @@ -1309,6 +1328,12 @@ will be collected into the ``my_output`` file. Signal Propagation ^^^^^^^^^^^^^^^^^^ +.. 
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + When ``mpirun`` receives a SIGTERM and SIGINT, it will attempt to kill the entire job by sending all processes in the job a SIGTERM, waiting a small number of seconds, then sending all processes in the job a @@ -1326,6 +1351,12 @@ Other signals are not currently propagated by ``mpirun``. Process Termination / Signal Handling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + During the run of an MPI application, if any process dies abnormally (either exiting before invoking :ref:`MPI_FINALIZE(3) `, or dying as the result of a signal), ``mpirun`` will print out an @@ -1346,6 +1377,12 @@ safest) for the user to only clean up non-MPI state. Process Environment ^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Processes in the MPI application inherit their environment from the PRRTE daemon upon the node on which they are running. The environment is typically inherited from the user's shell. On remote @@ -1365,6 +1402,12 @@ for more details. Remote Execution ^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Open MPI requires that the ``PATH`` environment variable be set to find executables on remote nodes (this is typically only necessary in rsh- or ssh-based environments |mdash| batch/scheduled environments @@ -1431,6 +1474,12 @@ is equivalent to Exported Environment Variables ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. 
admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + All environment variables that are named in the form ``OMPI_*`` will automatically be exported to new processes on the local and remote nodes. Environmental parameters can also be set/forwarded to the new @@ -1448,6 +1497,12 @@ them. Setting MCA Parameters ^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--mca`` switch allows the passing of parameters to various MCA (Modular Component Architecture) modules. MCA modules have direct impact on MPI programs because they allow tunable parameters to be set @@ -1508,6 +1563,12 @@ page for detailed information on this command. Setting MCA parameters and environment variables from file ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + The ``--tune`` command line option and its synonym ``--mca`` ``mca_base_envar_file_prefix`` allows a user to set MCA parameters and environment variables with the syntax described below. This option @@ -1532,6 +1593,12 @@ have higher precedence than variables specified in the file. Running as root ^^^^^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + .. warning:: The Open MPI team **strongly** advises against executing ``mpirun`` as the root user. MPI applications should be run as regular (non-root) users. @@ -1558,6 +1625,12 @@ against this behavior. Exit status ^^^^^^^^^^^ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? 
Should it be updated and + retained, or removed? + There is no standard definition for what ``mpirun`` should return as an exit status. After considerable discussion, we settled on the following method for assigning the ``mpirun`` exit status (note: in @@ -1599,6 +1672,12 @@ bullet points above). EXAMPLES -------- +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + Be sure also to see the examples throughout the sections above. .. code:: sh @@ -1613,6 +1692,12 @@ messages. RETURN VALUE ------------ +.. admonition:: This is old, hard-coded content + :class: error + + Is this content still current / accurate? Should it be updated and + retained, or removed? + ``mpirun`` returns 0 if all processes started by mpirun exit after calling :ref:`MPI_FINALIZE(3) `. A non-zero value is returned if an internal error occurred in mpirun, or one or more diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index bd1bc89ad57..749dfa1c7bf 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -154,9 +154,10 @@ Open MPI version 5.0.0rc12 - Many MPI one-sided and RDMA emulation fixes for the ``tcp`` BTL. - - This patch series fixs many issues when running with ``--mca - osc rdma --mca btl tcp``, i.e., TCP support for one sided - MPI calls. + This patch series fixes many issues when running with ``--mca + osc rdma --mca btl tcp``, i.e., TCP support for one sided + MPI calls. + - Many MPI one-sided fixes for the ``uct`` BTL. - Added support for ``acc_single_intrinsic`` to the one-sided ``ucx`` component. diff --git a/docs/no-prrte-content.rst.txt b/docs/no-prrte-content.rst.txt new file mode 100644 index 00000000000..ea034952d31 --- /dev/null +++ b/docs/no-prrte-content.rst.txt @@ -0,0 +1,24 @@ +.. This file is only used in certain cases.
Hence, the original file + in the Open MPI "docs" source tree ends in ".txt", so that Sphinx + will not complain if it is not used. If it *is* used, it is copied + to another file (that ends in ".rst") so that it can be properly + found / used by Sphinx. + +No content +^^^^^^^^^^ + +There is no meaningful content in this file because Open MPI was either: + +* Built without PRRTE support. + +* Built with a PRRTE that was too old to include machine-readable + documentation that could be incorporated into Open MPI's + documentation. + +If you build Open MPI with a newer version of PRRTE (and have the +Sphinx tool available when you run Open MPI's ``configure`` command), +you should get more meaningful documentation here. + +Hence, there is no documentation for this section. + +Sorry! From c3569811ec7cb518bb202b7d2fbeb32b238ca3c2 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sun, 10 Sep 2023 20:14:37 -0400 Subject: [PATCH 08/15] ReadTheDocs CI builds updates Since RTD doesn't run autogen, configure, or make, we now have to manually copy a few RST files from the embedded PRRTE to the docs/ tree before RTD invokes Sphinx. Signed-off-by: Jeff Squyres --- .readthedocs-pre-create-environment.sh | 36 ++++++++++++++++++++++++++ .readthedocs.yaml | 8 ++++++ 2 files changed, 44 insertions(+) create mode 100755 .readthedocs-pre-create-environment.sh diff --git a/.readthedocs-pre-create-environment.sh b/.readthedocs-pre-create-environment.sh new file mode 100755 index 00000000000..2709b822b80 --- /dev/null +++ b/.readthedocs-pre-create-environment.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +set -euxo pipefail + +# The ReadTheDocs build process does not run autogen/configure/make. +# Hence, we have to copy the PRRTE RST files (from the 3rd-party/prrte +# tree) to our docs/ tree manually. 
+ +# Ensure that we're in the RTD CI environment + +if [[ "${READTHEDOCS:-no}" == "no" ]]; then + echo "This script is only intended to be run in the ReadTheDocs CI environment" + exit 1 +fi + +SCHIZO_SRC_DIR=3rd-party/prrte/src/mca/schizo/ompi +SCHIZO_TARGET_DIR=docs/schizo-ompi-rst-content + +PRRTE_RST_SRC_DIR=3rd-party/prrte/src/docs/prrte-rst-content +PRRTE_RST_TARGET_DIR=docs/prrte-rst-content + +# Copy the OMPI schizo file from PRRTE + +cp -rp $SCHIZO_SRC_DIR $SCHIZO_TARGET_DIR + +# Only copy the PRRTE RST source files in prrte-rst-content that are +# referenced by ".. include::" in the schizo-ompi-cli.rst file. We do +# this because Sphinx complains if there are .rst files that are not +# referenced. :-( + +mkdir -p $PRRTE_RST_TARGET_DIR +files=`fgrep '.. include::' $SCHIZO_TARGET_DIR/schizo-ompi-cli.rstxt | awk '{ print $3 }'` +for file in $files; do + filename=`basename $file` + cp -pf $PRRTE_RST_SRC_DIR/$filename $PRRTE_RST_TARGET_DIR +done diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 44e0bbac5a7..2ba1fc07842 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -12,6 +12,11 @@ build: os: ubuntu-22.04 tools: python: "3.10" + jobs: + # RTD doesn't run configure or make. So we have to manually copy + # in the PRRTE RST files to docs/. + pre_create_environment: + - ./.readthedocs-pre-create-environment.sh python: install: @@ -21,3 +26,6 @@ python: sphinx: configuration: docs/conf.py fail_on_warning: true + +submodules: + include: all From e1f2eafd846349d26eab457ecd60f19590400ba9 Mon Sep 17 00:00:00 2001 From: Wenduo Wang Date: Mon, 11 Sep 2023 23:19:41 +0000 Subject: [PATCH 09/15] opal: deprecate dead code Remove unused code. The logic has been moved to openpmix/prrte. 
Signed-off-by: Wenduo Wang --- contrib/ompi_cplusplus.txt | 1 - opal/util/Makefile.am | 2 - opal/util/opal_pty.c | 256 ------------------------------------- opal/util/opal_pty.h | 53 -------- 4 files changed, 312 deletions(-) delete mode 100644 opal/util/opal_pty.c delete mode 100644 opal/util/opal_pty.h diff --git a/contrib/ompi_cplusplus.txt b/contrib/ompi_cplusplus.txt index a61994b0e69..35f2c95e36a 100644 --- a/contrib/ompi_cplusplus.txt +++ b/contrib/ompi_cplusplus.txt @@ -132,7 +132,6 @@ ./opal/util/few.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/keyval_parse.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/malloc.h: defined(c_plusplus) defined(__cplusplus) -./opal/util/opal_pty.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/os_path.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/qsort.h: defined(c_plusplus) defined(__cplusplus) ./opal/util/show_help_lex.h: defined(c_plusplus) defined(__cplusplus) diff --git a/opal/util/Makefile.am b/opal/util/Makefile.am index 646f44412b2..23f6b0ccd67 100644 --- a/opal/util/Makefile.am +++ b/opal/util/Makefile.am @@ -63,7 +63,6 @@ headers = \ numtostr.h \ opal_environ.h \ opal_getcwd.h \ - opal_pty.h \ os_dirpath.h \ os_path.h \ output.h \ @@ -108,7 +107,6 @@ libopalutil_core_la_SOURCES = \ numtostr.c \ opal_environ.c \ opal_getcwd.c \ - opal_pty.c \ os_dirpath.c \ os_path.c \ output.c \ diff --git a/opal/util/opal_pty.c b/opal/util/opal_pty.c deleted file mode 100644 index adbbc8570bb..00000000000 --- a/opal/util/opal_pty.c +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. 
- * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2018 Cisco Systems, Inc. All rights reserved - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -/*- - * Copyright (c) 1990, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include "opal_config.h" - -#ifdef HAVE_SYS_CDEFS_H -# include -#endif -#ifdef HAVE_SYS_TYPES_H -# include -#endif -#include -#ifdef HAVE_SYS_IOCTL_H -# include -#endif -#ifdef HAVE_FCNTL_H -# include -#endif -#ifdef HAVE_TERMIOS_H -# include -#else -# ifdef HAVE_TERMIO_H -# include -# endif -#endif -#include -#ifdef HAVE_UNISTD_H -# include -#endif -#include -#include -#ifdef HAVE_GRP_H -# include -#endif -#ifdef HAVE_PTY_H -# include -#endif -#ifdef HAVE_UTMP_H -# include -#endif - -#ifdef HAVE_PTSNAME -# include -# ifdef HAVE_STROPTS_H -# include -# endif -#endif - -#ifdef HAVE_UTIL_H -# include -#endif - -#include "opal/util/opal_pty.h" - -/* The only public interface is openpty - all others are to support - openpty() */ - -#if OPAL_ENABLE_PTY_SUPPORT == 0 - -int opal_openpty(int *amaster, int *aslave, char *name, void *termp, void *winpp) -{ - return -1; -} - -#elif defined(HAVE_OPENPTY) - -int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, struct winsize *winp) -{ - return openpty(amaster, aslave, name, termp, winp); -} - -#else - -/* implement openpty in terms of ptym_open and ptys_open */ - -static int ptym_open(char *pts_name); -static int ptys_open(int fdm, char *pts_name); - -int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, struct winsize *winp) -{ - char line[20]; - *amaster = ptym_open(line); - if (*amaster < 0) { - return -1; - } - *aslave = ptys_open(*amaster, line); - if (*aslave < 0) { - close(*amaster); - return -1; - } - if (name) { - // We don't know the max length of name, but we do know the - // max length of the source, so at least use that. 
- opal_string_copy(name, line, sizeof(line)); - } -# ifndef TCSAFLUSH -# define TCSAFLUSH TCSETAF -# endif - if (termp) { - (void) tcsetattr(*aslave, TCSAFLUSH, termp); - } -# ifdef TIOCSWINSZ - if (winp) { - (void) ioctl(*aslave, TIOCSWINSZ, (char *) winp); - } -# endif - return 0; -} - -static int ptym_open(char *pts_name) -{ - int fdm; -# ifdef HAVE_PTSNAME - char *ptr; - -# ifdef _AIX - strcpy(pts_name, "/dev/ptc"); -# else - strcpy(pts_name, "/dev/ptmx"); -# endif - fdm = open(pts_name, O_RDWR); - if (fdm < 0) { - return -1; - } - if (grantpt(fdm) < 0) { /* grant access to slave */ - close(fdm); - return -2; - } - if (unlockpt(fdm) < 0) { /* clear slave's lock flag */ - close(fdm); - return -3; - } - ptr = ptsname(fdm); - if (ptr == NULL) { /* get slave's name */ - close(fdm); - return -4; - } - strcpy(pts_name, ptr); /* return name of slave */ - return fdm; /* return fd of master */ -# else - char *ptr1, *ptr2; - - strcpy(pts_name, "/dev/ptyXY"); - /* array index: 012345689 (for references in following code) */ - for (ptr1 = "pqrstuvwxyzPQRST"; *ptr1 != 0; ptr1++) { - pts_name[8] = *ptr1; - for (ptr2 = "0123456789abcdef"; *ptr2 != 0; ptr2++) { - pts_name[9] = *ptr2; - /* try to open master */ - fdm = open(pts_name, O_RDWR); - if (fdm < 0) { - if (errno == ENOENT) { /* different from EIO */ - return -1; /* out of pty devices */ - } else { - continue; /* try next pty device */ - } - } - pts_name[5] = 't'; /* change "pty" to "tty" */ - return fdm; /* got it, return fd of master */ - } - } - return -1; /* out of pty devices */ -# endif -} - -static int ptys_open(int fdm, char *pts_name) -{ - int fds; -# ifdef HAVE_PTSNAME - /* following should allocate controlling terminal */ - fds = open(pts_name, O_RDWR); - if (fds < 0) { - close(fdm); - return -5; - } -# if defined(__SVR4) && defined(__sun) - if (ioctl(fds, I_PUSH, "ptem") < 0) { - close(fdm); - close(fds); - return -6; - } - if (ioctl(fds, I_PUSH, "ldterm") < 0) { - close(fdm); - close(fds); - return -7; - } 
-# endif - - return fds; -# else - int gid; - struct group *grptr; - - grptr = getgrnam("tty"); - if (grptr != NULL) { - gid = grptr->gr_gid; - } else { - gid = -1; /* group tty is not in the group file */ - } - /* following two functions don't work unless we're root */ - chown(pts_name, getuid(), gid); - chmod(pts_name, S_IRUSR | S_IWUSR | S_IWGRP); - fds = open(pts_name, O_RDWR); - if (fds < 0) { - close(fdm); - return -1; - } - return fds; -# endif -} - -#endif /* #ifdef HAVE_OPENPTY */ diff --git a/opal/util/opal_pty.h b/opal/util/opal_pty.h deleted file mode 100644 index f30cd97d5ec..00000000000 --- a/opal/util/opal_pty.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2006 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OPAL_UTIL_PTY_H -#define OPAL_UTIL_PTY_H - -#include "opal_config.h" - -#ifdef HAVE_UTIL_H -# include -#endif -#ifdef HAVE_LIBUTIL_H -# include -#endif -#ifdef HAVE_TERMIOS_H -# include -#else -# ifdef HAVE_TERMIO_H -# include -# endif -#endif - -BEGIN_C_DECLS - -#if OPAL_ENABLE_PTY_SUPPORT - -OPAL_DECLSPEC int opal_openpty(int *amaster, int *aslave, char *name, struct termios *termp, - struct winsize *winp); - -#else - -OPAL_DECLSPEC int opal_openpty(int *amaster, int *aslave, char *name, void *termp, void *winpp); - -#endif - -END_C_DECLS - -#endif /* OPAL_UTIL_PTY_H */ From fff842684005f556343209c0f4c2e88133f19b51 Mon Sep 17 00:00:00 2001 From: Evgeny Baskakov Date: Mon, 25 Sep 2023 12:25:40 -0700 Subject: [PATCH 10/15] Bugfix in OMPI_ARRAY_FINT_2_INT_ALLOC and OMPI_ARRAY_LOGICAL_2_INT_ALLOC macros for incorrect storage size calculation. Signed-off-by: Evgeny Baskakov --- ompi/mpi/fortran/base/fint_2_int.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ompi/mpi/fortran/base/fint_2_int.h b/ompi/mpi/fortran/base/fint_2_int.h index d3c71454386..ec2ba43fa1b 100644 --- a/ompi/mpi/fortran/base/fint_2_int.h +++ b/ompi/mpi/fortran/base/fint_2_int.h @@ -60,7 +60,7 @@ /* This is for OUT parameters. Does only alloc */ #define OMPI_ARRAY_FINT_2_INT_ALLOC(in, n) \ - OMPI_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) /* This is for IN/IN-OUT parameters. Does alloc and assignment */ #define OMPI_ARRAY_FINT_2_INT(in, n) \ @@ -117,7 +117,7 @@ /* This is for OUT parameters. 
Does only alloc */ #define OMPI_ARRAY_FINT_2_INT_ALLOC(in, n) \ - OMPI_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) #define OMPI_ARRAY_FINT_2_INT(in, n) \ do { \ @@ -204,7 +204,7 @@ # define OMPI_LOGICAL_ARRAY_NAME_DECL(in) int * c_##in # define OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) c_##in # define OMPI_ARRAY_LOGICAL_2_INT_ALLOC(in,n) \ - OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) = malloc(n * sizeof(int)) + OMPI_LOGICAL_ARRAY_NAME_CONVERT(in) = malloc((n) * sizeof(int)) # define OMPI_ARRAY_LOGICAL_2_INT_CLEANUP(in) \ free(OMPI_LOGICAL_ARRAY_NAME_CONVERT(in)) From 62d19b01534fd3f781e33f2762ef5371a9dc95f2 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Thu, 21 Sep 2023 13:38:37 +0000 Subject: [PATCH 11/15] pr-checks: update compile-rocm workflow - use rocm-hip-runtime instead of rocm-hip-sdk macropackage to reduce the size of the installed packages - add a clean-up step to the rocm-compile script to help potentially with the memory-consumption of the github actions environment. 
Signed-off-by: Edgar Gabriel --- .github/workflows/compile-rocm.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/compile-rocm.yaml b/.github/workflows/compile-rocm.yaml index 7c98e1a5916..cf4ad932032 100644 --- a/.github/workflows/compile-rocm.yaml +++ b/.github/workflows/compile-rocm.yaml @@ -17,7 +17,7 @@ jobs: curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg echo 'deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/debian focal main' | sudo tee /etc/apt/sources.list.d/rocm.list sudo apt-get update - sudo apt-get install -y rocm-hip-sdk + sudo apt-get install -y rocm-hip-runtime - uses: actions/checkout@v3 with: submodules: recursive @@ -26,3 +26,9 @@ jobs: ./autogen.pl ./configure --prefix=${PWD}/install --with-rocm=/opt/rocm --disable-mpi-fortran make -j + - name: Clean up + run: | + ls -la ./ + rm -rf ./* + rm -rf ./.??* + ls -la ./ \ No newline at end of file From 2a38fe4390d3ec6ba95996272ec1360dc32c7f4c Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Fri, 29 Sep 2023 15:10:45 -0400 Subject: [PATCH 12/15] Update news in preparation for v5.0.0rc13. Signed-off-by: Austen Lauria --- docs/news/news-v5.0.x.rst | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/docs/news/news-v5.0.x.rst b/docs/news/news-v5.0.x.rst index 749dfa1c7bf..54445d8b743 100644 --- a/docs/news/news-v5.0.x.rst +++ b/docs/news/news-v5.0.x.rst @@ -4,9 +4,9 @@ Open MPI v5.0.x series This file contains all the NEWS updates for the Open MPI v5.0.x series, in reverse chronological order. -Open MPI version 5.0.0rc12 +Open MPI version 5.0.0rc13 -------------------------- -:Date: 19 May 2023 +:Date: 29 September 2023 .. 
admonition:: The MPIR API has been removed :class: warning @@ -66,30 +66,23 @@ Open MPI version 5.0.0rc12 Libevent symbols and then statically pulled the library into ``libmpi.so``. -- Changes since rc11: - - - ``accelerator/rocm``: add SYNC_MEMOPS support. - - Update PMIx, PRRTe, and OAC submodule pointers. - - Fix ``mca_btl_ofi_flush()`` in multithreaded environments.. - - ``smcuda``: fixed an edge case when building MCA components as - dynamic shared objects. - - Fix ``MPI_Session_init()`` bug if all previous sessions are - finalized. - - Fix `mpi4py `_ hang in - ``MPI_Intercomm_create_from_groups()``. - - Fix finalization segfault with OSHMEM 4.1.5. - - Improve AVX detection. Fixes ``op/avx`` link failure with the - ``nvhpc`` compiler. - - Fix incorrect results with ``pml/ucx`` using Intel compiler. - - Fix segfault when broadcasting large MPI structs. - - Add platform files for Google Cloud HPC. - - UCC/HCOLL: Fix ``MPI_Waitall()`` for non blokcing collectives. - - Fix pre-built docs check. +- Changes since rc12: + + - Update PMIx to the ``v4.2.6`` release tag. Hash: ``f20e0d5``. + - Update PRRTE to the ``v3.0.1`` release tag. Hash: ``63370ca``. + - Lots of documentation updates. + - Fixed parameter name in ``MPI_Intercomm_merge``. Thanks to Yan Wu for the report. + - ``OFI``: Update NIC selection to determine optimal interfaces from the current process. + - Fix reordering of received data in ``MPI_Gather``. + - Disable builds with ``HWLOC`` versions >= 3.0.0. This is currently not supported. + - Fix re-ordering of ranks in ``MPI_Dist_graph_create``. + - ``coll/HAN``: Fix bug when using ``MPI_IN_PLACE`` with ``MPI_Reduce``. + - Fix ``MPI_Type_Dup`` to propagate errors from inner calls. + - Fix the compilation of the monitoring infrastructure. + - Various other bug fixes. - All other notable updates for v5.0.0: - - Update PMIx to the ``v4.2`` branch - current hash: ``f34a7ce2``. - - Update PRRTE to the ``v3.0`` branch - current hash: ``c4925aa5cc``. 
- New Features: - ULFM Fault Tolerance support has been added. See :ref:`the ULFM From 5adb240f6509e86407d642214251b5640c2344f8 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 2 Oct 2023 20:31:42 -0700 Subject: [PATCH 13/15] docs: Fix build case with --disable-prrte Fix a small issue in properly setting filename when building the empty schizo rst file. Signed-off-by: Brian Barrett --- docs/Makefile.am | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/Makefile.am b/docs/Makefile.am index dc9a085e99e..eacf2baf9b8 100644 --- a/docs/Makefile.am +++ b/docs/Makefile.am @@ -962,10 +962,9 @@ $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(OMPI_SCHIZO_OMPI_RS else $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(builddir)/schizo-ompi-rst-content $(builddir)/schizo-ompi-rst-content/schizo-ompi-cli.rstxt: $(srcdir)/no-prrte-content.rst.txt - if test ! -d "$$dir"; then mkdir "$$dir"; fi + dir=`dirname $@`; if test ! -d "$$dir"; then mkdir "$$dir"; fi $(OMPI_V_SPHINX_COPYRST) \ - dir=`dirname $@`; \ - cp -pf $(srcdir)/no-prrte-content.rst.txt "$$dir" + cp -pf $(srcdir)/no-prrte-content.rst.txt "$@" endif $(ALL_MAN_BUILT): $(builddir)/prrte-rst-content From 3ef5dc9a0c901322a6aa190f63f8dbc6af75626d Mon Sep 17 00:00:00 2001 From: Austen Lauria Date: Tue, 3 Oct 2023 08:26:41 -0400 Subject: [PATCH 14/15] Patch the prrte.spec file. This is already fixed in prrte but for v5.0.x and main we'll want this fix applied for any rpm generation. This can safely be removed once main and v5.0.x advance. On v5.0.x this will be the next prrte release. For main, the next submodule update is fine to remove this. 
Signed-off-by: Austen Lauria --- autogen.pl | 4 ++++ config/prrte.spec.diff | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 config/prrte.spec.diff diff --git a/autogen.pl b/autogen.pl index 5af4704f2a1..3cb79025dbf 100755 --- a/autogen.pl +++ b/autogen.pl @@ -1643,6 +1643,10 @@ sub replace_config_sub_guess { if (! -f "3rd-party/prrte/configure.ac") { my_die("Could not find pmix files\n"); } + + verbose "Patching prrte.spec file\n"; + system("$patch_prog -N -p0 < ./config/prrte.spec.diff > /dev/null 2>&1"); + push(@subdirs, "3rd-party/prrte/"); $m4 .= "m4_define([package_prrte], [1])\n"; diff --git a/config/prrte.spec.diff b/config/prrte.spec.diff new file mode 100644 index 00000000000..4e8b1a86eb1 --- /dev/null +++ b/config/prrte.spec.diff @@ -0,0 +1,20 @@ +--- 3rd-party/prrte/contrib/dist/linux/prrte.spec 2023-10-03 08:12:43.842625000 -0400 ++++ 3rd-party/prrte/contrib/dist/linux/prrte.spec 2023-10-03 08:12:27.849686000 -0400 +@@ -612,7 +612,7 @@ + %{shell_scripts_path}/%{shell_scripts_basename}.sh + %{shell_scripts_path}/%{shell_scripts_basename}.csh + %endif +-%doc README INSTALL LICENSE ++%doc README.md LICENSE + + %else + +@@ -656,7 +656,7 @@ + %{shell_scripts_path}/%{shell_scripts_basename}.sh + %{shell_scripts_path}/%{shell_scripts_basename}.csh + %endif +-%doc README INSTALL LICENSE ++%doc README.md LICENSE + %{_pkgdatadir} + + %files devel -f devel.files From 776e8babd6868b968d1724161a6999861723b08a Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Tue, 19 Sep 2023 09:47:42 +0300 Subject: [PATCH 15/15] oshmem: Add symmetric remote key handling code At very high scale, having each rank storing each other rank's remote keys for each segment can lead to high memory consumption. We activate symmetric remote key option to generate remote keys that will be deduplicated and then used interchangeably. 
Signed-off-by: Thomas Vegas --- config/ompi_check_ucx.m4 | 6 +- oshmem/mca/spml/ucx/spml_ucx.c | 183 +++++++++++++++++++++- oshmem/mca/spml/ucx/spml_ucx.h | 41 +++-- oshmem/mca/spml/ucx/spml_ucx_component.c | 39 +++-- oshmem/mca/sshmem/ucx/sshmem_ucx_module.c | 3 +- 5 files changed, 241 insertions(+), 31 deletions(-) diff --git a/config/ompi_check_ucx.m4 b/config/ompi_check_ucx.m4 index fbea98cd7b3..01e39aaf968 100644 --- a/config/ompi_check_ucx.m4 +++ b/config/ompi_check_ucx.m4 @@ -108,7 +108,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[ UCP_PARAM_FIELD_ESTIMATED_NUM_PPN, UCP_WORKER_FLAG_IGNORE_REQUEST_LEAK, UCP_OP_ATTR_FLAG_MULTI_SEND, - UCS_MEMORY_TYPE_RDMA], + UCS_MEMORY_TYPE_RDMA, + UCP_MEM_MAP_SYMMETRIC_RKEY], [], [], [#include ]) AC_CHECK_DECLS([UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS], @@ -124,7 +125,8 @@ AC_DEFUN([OMPI_CHECK_UCX],[ [#include ]) AC_CHECK_DECLS([ucp_tag_send_nbx, ucp_tag_send_sync_nbx, - ucp_tag_recv_nbx], + ucp_tag_recv_nbx, + ucp_rkey_compare], [], [], [#include ]) AC_CHECK_TYPES([ucp_request_param_t], diff --git a/oshmem/mca/spml/ucx/spml_ucx.c b/oshmem/mca/spml/ucx/spml_ucx.c index 570b4d25a7a..5493d78e661 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.c +++ b/oshmem/mca/spml/ucx/spml_ucx.c @@ -22,6 +22,7 @@ #include "opal/datatype/opal_convertor.h" #include "opal/mca/common/ucx/common_ucx.h" #include "opal/util/opal_environ.h" +#include "opal/util/minmax.h" #include "ompi/datatype/ompi_datatype.h" #include "ompi/mca/pml/pml.h" @@ -126,6 +127,171 @@ static ucp_request_param_t mca_spml_ucx_request_param_b = { }; #endif +unsigned +mca_spml_ucx_mem_map_flags_symmetric_rkey(struct mca_spml_ucx *spml_ucx) +{ +#if HAVE_DECL_UCP_MEM_MAP_SYMMETRIC_RKEY + if (spml_ucx->symmetric_rkey_max_count > 0) { + return UCP_MEM_MAP_SYMMETRIC_RKEY; + } +#endif + + return 0; +} + +void mca_spml_ucx_rkey_store_init(mca_spml_ucx_rkey_store_t *store) +{ + store->array = NULL; + store->count = 0; + store->size = 0; +} + +void mca_spml_ucx_rkey_store_cleanup(mca_spml_ucx_rkey_store_t 
*store) +{ + int i; + + for (i = 0; i < store->count; i++) { + if (store->array[i].refcnt != 0) { + SPML_UCX_ERROR("rkey store destroy: %d/%d has refcnt %d > 0", + i, store->count, store->array[i].refcnt); + } + + ucp_rkey_destroy(store->array[i].rkey); + } + + free(store->array); +} + +/** + * Find position in sorted array for existing or future entry + * + * @param[in] store Store of the entries + * @param[in] worker Common worker for rkeys used + * @param[in] rkey Remote key to search for + * @param[out] index Index of entry + * + * @return + * OSHMEM_ERR_NOT_FOUND: index contains the position where future element + * should be inserted to keep array sorted + * OSHMEM_SUCCESS : index contains the position of the element + * Other error : index is not valid + */ +static int mca_spml_ucx_rkey_store_find(const mca_spml_ucx_rkey_store_t *store, + const ucp_worker_h worker, + const ucp_rkey_h rkey, + int *index) +{ +#if HAVE_DECL_UCP_RKEY_COMPARE + ucp_rkey_compare_params_t params; + int i, result, m, end; + ucs_status_t status; + + for (i = 0, end = store->count; i < end;) { + m = (i + end) / 2; + + params.field_mask = 0; + status = ucp_rkey_compare(worker, store->array[m].rkey, + rkey, &params, &result); + if (status != UCS_OK) { + return OSHMEM_ERROR; + } else if (result == 0) { + *index = m; + return OSHMEM_SUCCESS; + } else if (result > 0) { + end = m; + } else { + i = m + 1; + } + } + + *index = i; + return OSHMEM_ERR_NOT_FOUND; +#else + return OSHMEM_ERROR; +#endif +} + +static void mca_spml_ucx_rkey_store_insert(mca_spml_ucx_rkey_store_t *store, + int i, ucp_rkey_h rkey) +{ + int size; + mca_spml_ucx_rkey_t *tmp; + + if (store->count >= mca_spml_ucx.symmetric_rkey_max_count) { + return; + } + + if (store->count >= store->size) { + size = opal_min(opal_max(store->size, 8) * 2, + mca_spml_ucx.symmetric_rkey_max_count); + tmp = realloc(store->array, size * sizeof(*store->array)); + if (tmp == NULL) { + return; + } + + store->array = tmp; + store->size = size; + } +
+ memmove(&store->array[i + 1], &store->array[i], + (store->count - i) * sizeof(*store->array)); + store->array[i].rkey = rkey; + store->array[i].refcnt = 1; + store->count++; + return; +} + +/* Takes ownership of input ucp remote key */ +static ucp_rkey_h mca_spml_ucx_rkey_store_get(mca_spml_ucx_rkey_store_t *store, + ucp_worker_h worker, + ucp_rkey_h rkey) +{ + int ret, i; + + if (mca_spml_ucx.symmetric_rkey_max_count == 0) { + return rkey; + } + + ret = mca_spml_ucx_rkey_store_find(store, worker, rkey, &i); + if (ret == OSHMEM_SUCCESS) { + ucp_rkey_destroy(rkey); + store->array[i].refcnt++; + return store->array[i].rkey; + } + + if (ret == OSHMEM_ERR_NOT_FOUND) { + mca_spml_ucx_rkey_store_insert(store, i, rkey); + } + + return rkey; +} + +static void mca_spml_ucx_rkey_store_put(mca_spml_ucx_rkey_store_t *store, + ucp_worker_h worker, + ucp_rkey_h rkey) +{ + mca_spml_ucx_rkey_t *entry; + int ret, i; + + ret = mca_spml_ucx_rkey_store_find(store, worker, rkey, &i); + if (ret != OSHMEM_SUCCESS) { + goto out; + } + + entry = &store->array[i]; + assert(entry->rkey == rkey); + if (--entry->refcnt > 0) { + return; + } + + memmove(&store->array[i], &store->array[i + 1], + (store->count - (i + 1)) * sizeof(*store->array)); + store->count--; + +out: + ucp_rkey_destroy(rkey); +} + int mca_spml_ucx_enable(bool enable) { SPML_UCX_VERBOSE(50, "*** ucx ENABLED ****"); @@ -240,6 +406,7 @@ int mca_spml_ucx_ctx_mkey_add(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn { int rc; ucs_status_t err; + ucp_rkey_h rkey; rc = mca_spml_ucx_ctx_mkey_new(ucx_ctx, pe, segno, ucx_mkey); if (OSHMEM_SUCCESS != rc) { @@ -248,11 +415,18 @@ int mca_spml_ucx_ctx_mkey_add(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn } if (mkey->u.data) { - err = ucp_ep_rkey_unpack(ucx_ctx->ucp_peers[pe].ucp_conn, mkey->u.data, &((*ucx_mkey)->rkey)); + err = ucp_ep_rkey_unpack(ucx_ctx->ucp_peers[pe].ucp_conn, mkey->u.data, &rkey); if (UCS_OK != err) { SPML_UCX_ERROR("failed to unpack rkey: %s", 
ucs_status_string(err)); return OSHMEM_ERROR; } + + if (!oshmem_proc_on_local_node(pe)) { + rkey = mca_spml_ucx_rkey_store_get(&ucx_ctx->rkey_store, ucx_ctx->ucp_worker[0], rkey); + } + + (*ucx_mkey)->rkey = rkey; + rc = mca_spml_ucx_ctx_mkey_cache(ucx_ctx, mkey, segno, pe); if (OSHMEM_SUCCESS != rc) { SPML_UCX_ERROR("mca_spml_ucx_ctx_mkey_cache failed"); @@ -267,7 +441,7 @@ int mca_spml_ucx_ctx_mkey_del(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn ucp_peer_t *ucp_peer; int rc; ucp_peer = &(ucx_ctx->ucp_peers[pe]); - ucp_rkey_destroy(ucx_mkey->rkey); + mca_spml_ucx_rkey_store_put(&ucx_ctx->rkey_store, ucx_ctx->ucp_worker[0], ucx_mkey->rkey); ucx_mkey->rkey = NULL; rc = mca_spml_ucx_peer_mkey_cache_del(ucp_peer, segno); if(OSHMEM_SUCCESS != rc){ @@ -725,7 +899,8 @@ sshmem_mkey_t *mca_spml_ucx_register(void* addr, UCP_MEM_MAP_PARAM_FIELD_FLAGS; mem_map_params.address = addr; mem_map_params.length = size; - mem_map_params.flags = flags; + mem_map_params.flags = flags | + mca_spml_ucx_mem_map_flags_symmetric_rkey(&mca_spml_ucx); status = ucp_mem_map(mca_spml_ucx.ucp_context, &mem_map_params, &mem_h); if (UCS_OK != status) { @@ -917,6 +1092,8 @@ static int mca_spml_ucx_ctx_create_common(long options, mca_spml_ucx_ctx_t **ucx } } + mca_spml_ucx_rkey_store_init(&ucx_ctx->rkey_store); + *ucx_ctx_p = ucx_ctx; return OSHMEM_SUCCESS; diff --git a/oshmem/mca/spml/ucx/spml_ucx.h b/oshmem/mca/spml/ucx/spml_ucx.h index a93ff3756a3..2fec131ad2d 100644 --- a/oshmem/mca/spml/ucx/spml_ucx.h +++ b/oshmem/mca/spml/ucx/spml_ucx.h @@ -76,18 +76,31 @@ struct ucp_peer { size_t mkeys_cnt; }; typedef struct ucp_peer ucp_peer_t; - + +/* An rkey_store entry */ +typedef struct mca_spml_ucx_rkey { + ucp_rkey_h rkey; + int refcnt; +} mca_spml_ucx_rkey_t; + +typedef struct mca_spml_ucx_rkey_store { + mca_spml_ucx_rkey_t *array; + int size; + int count; +} mca_spml_ucx_rkey_store_t; + struct mca_spml_ucx_ctx { - ucp_worker_h *ucp_worker; - ucp_peer_t *ucp_peers; - long options; - 
opal_bitmap_t put_op_bitmap; - unsigned long nb_progress_cnt; - unsigned int ucp_workers; - int *put_proc_indexes; - unsigned put_proc_count; - bool synchronized_quiet; - int strong_sync; + ucp_worker_h *ucp_worker; + ucp_peer_t *ucp_peers; + long options; + opal_bitmap_t put_op_bitmap; + unsigned long nb_progress_cnt; + unsigned int ucp_workers; + int *put_proc_indexes; + unsigned put_proc_count; + bool synchronized_quiet; + int strong_sync; + mca_spml_ucx_rkey_store_t rkey_store; }; typedef struct mca_spml_ucx_ctx mca_spml_ucx_ctx_t; @@ -128,6 +141,7 @@ struct mca_spml_ucx { unsigned long nb_ucp_worker_progress; unsigned int ucp_workers; unsigned int ucp_worker_cnt; + int symmetric_rkey_max_count; }; typedef struct mca_spml_ucx mca_spml_ucx_t; @@ -280,6 +294,11 @@ extern int mca_spml_ucx_team_fcollect(shmem_team_t team, void extern int mca_spml_ucx_team_reduce(shmem_team_t team, void *dest, const void *source, size_t nreduce, int operation, int datatype); +extern unsigned +mca_spml_ucx_mem_map_flags_symmetric_rkey(struct mca_spml_ucx *spml_ucx); + +extern void mca_spml_ucx_rkey_store_init(mca_spml_ucx_rkey_store_t *store); +extern void mca_spml_ucx_rkey_store_cleanup(mca_spml_ucx_rkey_store_t *store); static inline int mca_spml_ucx_peer_mkey_get(ucp_peer_t *ucp_peer, int index, spml_ucx_cached_mkey_t **out_rmkey) diff --git a/oshmem/mca/spml/ucx/spml_ucx_component.c b/oshmem/mca/spml/ucx/spml_ucx_component.c index 1ab00ac1786..e44a800a8be 100644 --- a/oshmem/mca/spml/ucx/spml_ucx_component.c +++ b/oshmem/mca/spml/ucx/spml_ucx_component.c @@ -153,6 +153,10 @@ static int mca_spml_ucx_component_register(void) "Enable asynchronous progress thread", &mca_spml_ucx.async_progress); + mca_spml_ucx_param_register_int("symmetric_rkey_max_count", 0, + "Size of the symmetric key store. 
Non-zero to enable, typical use 5000", + &mca_spml_ucx.symmetric_rkey_max_count); + mca_spml_ucx_param_register_int("async_tick_usec", 3000, "Asynchronous progress tick granularity (in usec)", &mca_spml_ucx.async_tick); @@ -332,6 +336,8 @@ static int spml_ucx_init(void) mca_spml_ucx_ctx_default.ucp_workers++; } + mca_spml_ucx_rkey_store_init(&mca_spml_ucx_ctx_default.rkey_store); + wrk_attr.field_mask = UCP_WORKER_ATTR_FIELD_THREAD_MODE; err = ucp_worker_query(mca_spml_ucx_ctx_default.ucp_worker[0], &wrk_attr); @@ -436,10 +442,25 @@ static void _ctx_cleanup(mca_spml_ucx_ctx_t *ctx) free(ctx->ucp_peers); } +static void mca_spml_ucx_ctx_fini(mca_spml_ucx_ctx_t *ctx) +{ + unsigned int i; + + mca_spml_ucx_rkey_store_cleanup(&ctx->rkey_store); + for (i = 0; i < ctx->ucp_workers; i++) { + ucp_worker_destroy(ctx->ucp_worker[i]); + } + free(ctx->ucp_worker); + if (ctx != &mca_spml_ucx_ctx_default) { + free(ctx); + } +} + static int mca_spml_ucx_component_fini(void) { int fenced = 0, i; int ret = OSHMEM_SUCCESS; + mca_spml_ucx_ctx_t *ctx; opal_progress_unregister(spml_ucx_default_progress); if (mca_spml_ucx.active_array.ctxs_count) { @@ -492,36 +513,26 @@ static int mca_spml_ucx_component_fini(void) } } - /* delete all workers */ for (i = 0; i < mca_spml_ucx.active_array.ctxs_count; i++) { - ucp_worker_destroy(mca_spml_ucx.active_array.ctxs[i]->ucp_worker[0]); - free(mca_spml_ucx.active_array.ctxs[i]->ucp_worker); - free(mca_spml_ucx.active_array.ctxs[i]); + mca_spml_ucx_ctx_fini(mca_spml_ucx.active_array.ctxs[i]); } for (i = 0; i < mca_spml_ucx.idle_array.ctxs_count; i++) { - ucp_worker_destroy(mca_spml_ucx.idle_array.ctxs[i]->ucp_worker[0]); - free(mca_spml_ucx.idle_array.ctxs[i]->ucp_worker); - free(mca_spml_ucx.idle_array.ctxs[i]); + mca_spml_ucx_ctx_fini(mca_spml_ucx.idle_array.ctxs[i]); } if (mca_spml_ucx_ctx_default.ucp_worker) { - for (i = 0; i < (signed int)mca_spml_ucx.ucp_workers; i++) { - ucp_worker_destroy(mca_spml_ucx_ctx_default.ucp_worker[i]); - } - 
free(mca_spml_ucx_ctx_default.ucp_worker); + mca_spml_ucx_ctx_fini(&mca_spml_ucx_ctx_default); } if (mca_spml_ucx.aux_ctx != NULL) { - ucp_worker_destroy(mca_spml_ucx.aux_ctx->ucp_worker[0]); - free(mca_spml_ucx.aux_ctx->ucp_worker); + mca_spml_ucx_ctx_fini(mca_spml_ucx.aux_ctx); } mca_spml_ucx.enabled = false; /* not anymore */ free(mca_spml_ucx.active_array.ctxs); free(mca_spml_ucx.idle_array.ctxs); - free(mca_spml_ucx.aux_ctx); SHMEM_MUTEX_DESTROY(mca_spml_ucx.internal_mutex); pthread_mutex_destroy(&mca_spml_ucx.ctx_create_mutex); diff --git a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c index 262bef5ffe6..688bfce6f19 100644 --- a/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c +++ b/oshmem/mca/sshmem/ucx/sshmem_ucx_module.c @@ -118,7 +118,8 @@ segment_create_internal(map_segment_t *ds_buf, void *address, size_t size, mem_map_params.address = address; mem_map_params.length = size; - mem_map_params.flags = flags; + mem_map_params.flags = flags | + mca_spml_ucx_mem_map_flags_symmetric_rkey(spml); mem_map_params.memory_type = mem_type; status = ucp_mem_map(spml->ucp_context, &mem_map_params, &mem_h);