From 2166c22c431351af0a51b59fbf5a85146f0ef81a Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Tue, 21 Dec 2021 10:48:04 -0600 Subject: [PATCH 1/5] sharedfp_sm_file_component_query: add file open to ensure correct operations try to actually open the sharedfp/sm file during the query operation to ensure that the component can actually run. This is based on some reports on the mailing list that the sharedfp/sm operation causes problems in certain circumstances. Fixes issue #9656 Signed-off-by: Edgar Gabriel (cherry picked from commit d8464d2df5355d6c51d0c91def33e146fecedfa2) --- ompi/mca/sharedfp/sm/sharedfp_sm.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/ompi/mca/sharedfp/sm/sharedfp_sm.c b/ompi/mca/sharedfp/sm/sharedfp_sm.c index 498c02d716a..c572caffc3c 100644 --- a/ompi/mca/sharedfp/sm/sharedfp_sm.c +++ b/ompi/mca/sharedfp/sm/sharedfp_sm.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2008-2013 University of Houston. All rights reserved. + * Copyright (c) 2008-2021 University of Houston. All rights reserved. * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. * $COPYRIGHT$ @@ -94,6 +94,29 @@ struct mca_sharedfp_base_module_1_0_0_t * mca_sharedfp_sm_component_file_query(o return NULL; } } + + + /* Check that we can actually open the required file */ + char *filename_basename = basename((char*)fh->f_filename); + char *sm_filename; + int comm_cid = -1; + int pid = ompi_comm_rank (comm); + + asprintf(&sm_filename, "%s/%s_cid-%d-%d.sm", ompi_process_info.job_session_dir, + filename_basename, comm_cid, pid); + + int sm_fd = open(sm_filename, O_RDWR | O_CREAT, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if ( sm_fd == -1){ + /*error opening file*/ + opal_output(0,"mca_sharedfp_sm_component_file_query: Error, unable to open file for mmap: %s\n",sm_filename); + free(sm_filename); + return NULL; + } + close (sm_fd); + unlink(sm_filename); + free (sm_filename); + /* This module can run */ *priority = mca_sharedfp_sm_priority; return &sm; From 14d0c4329439de81ef283762212d4a2ed388cd63 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Tue, 28 Dec 2021 12:28:37 -0500 Subject: [PATCH 2/5] sharedfp: use opal_basename() Fix macOS build by using opal_basename() instead of basename(). Signed-off-by: Jeff Squyres (cherry picked from commit c79291ea86d9c9590cf6102c173d3f348b6a463f) --- ompi/mca/sharedfp/sm/sharedfp_sm.c | 5 ++++- ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ompi/mca/sharedfp/sm/sharedfp_sm.c b/ompi/mca/sharedfp/sm/sharedfp_sm.c index c572caffc3c..a0284f54a92 100644 --- a/ompi/mca/sharedfp/sm/sharedfp_sm.c +++ b/ompi/mca/sharedfp/sm/sharedfp_sm.c @@ -12,6 +12,7 @@ * Copyright (c) 2008-2021 University of Houston. All rights reserved. * Copyright (c) 2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2021 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -31,6 +32,8 @@ #include "ompi/mca/sharedfp/base/base.h" #include "ompi/mca/sharedfp/sm/sharedfp_sm.h" +#include "opal/util/basename.h" + /* * ******************************************************************* * ************************ actions structure ************************ @@ -97,7 +100,7 @@ struct mca_sharedfp_base_module_1_0_0_t * mca_sharedfp_sm_component_file_query(o /* Check that we can actually open the required file */ - char *filename_basename = basename((char*)fh->f_filename); + char *filename_basename = opal_basename((char*)fh->f_filename); char *sm_filename; int comm_cid = -1; int pid = ompi_comm_rank (comm); diff --git a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c index 6526ee52480..9b16d48ef17 100644 --- a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c +++ b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c @@ -13,7 +13,7 @@ * Copyright (c) 2013 Intel, Inc. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2015-2021 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * @@ -41,6 +41,8 @@ #include "ompi/mca/sharedfp/sharedfp.h" #include "ompi/mca/sharedfp/base/base.h" +#include "opal/util/basename.h" + #include #include #include @@ -101,7 +103,7 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, ** and then mapping it to memory ** For sharedfp we also want to put the file backed shared memory into the tmp directory */ - filename_basename = basename((char*)filename); + filename_basename = opal_basename((char*)filename); /* format is "%s/%s_cid-%d-%d.sm", see below */ sm_filename_length = strlen(ompi_process_info.job_session_dir) + 1 + strlen(filename_basename) + 5 + (3*sizeof(uint32_t)+1) + 4; sm_filename = (char*) malloc( sizeof(char) * sm_filename_length); From a61f979e0f2fc1992e51d9ed3cedb431b35ef234 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Thu, 30 Dec 2021 07:08:30 -0800 Subject: [PATCH 3/5] sharedfp: fix minor memory leaks Commit c79291ea86d9c9590cf6102c173d3f348b6a463f accidentally introduced minor memory leaks by calling opal_basename() without free()ing the results. Fixes CID 1496830. Signed-off-by: Jeff Squyres (cherry picked from commit 03985980956187b9f37f249b265e4e5ec7389280) --- ompi/mca/sharedfp/sm/sharedfp_sm.c | 1 + ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/ompi/mca/sharedfp/sm/sharedfp_sm.c b/ompi/mca/sharedfp/sm/sharedfp_sm.c index a0284f54a92..c32a0d1fbeb 100644 --- a/ompi/mca/sharedfp/sm/sharedfp_sm.c +++ b/ompi/mca/sharedfp/sm/sharedfp_sm.c @@ -107,6 +107,7 @@ struct mca_sharedfp_base_module_1_0_0_t * mca_sharedfp_sm_component_file_query(o asprintf(&sm_filename, "%s/%s_cid-%d-%d.sm", ompi_process_info.job_session_dir, filename_basename, comm_cid, pid); + free(filename_basename); int sm_fd = open(sm_filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); diff --git a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c index 9b16d48ef17..b1702b148ca 100644 --- a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c +++ b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c @@ -109,6 +109,7 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, sm_filename = (char*) malloc( sizeof(char) * sm_filename_length); if (NULL == sm_filename) { opal_output(0, "mca_sharedfp_sm_file_open: Error, unable to malloc sm_filename\n"); + free(filename_basename); free(sm_data); free(sh); return OMPI_ERR_OUT_OF_RESOURCE; @@ -122,6 +123,7 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, err = comm->c_coll->coll_bcast (&int_pid, 1, MPI_INT, 0, comm, comm->c_coll->coll_bcast_module ); if ( OMPI_SUCCESS != err ) { opal_output(0,"mca_sharedfp_sm_file_open: Error in bcast operation \n"); + free(filename_basename); free(sm_filename); free(sm_data); free(sh); @@ -136,6 +138,7 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, if ( sm_fd == -1){ /*error opening file*/ opal_output(0,"mca_sharedfp_sm_file_open: Error, unable to open file for mmap: %s\n",sm_filename); + free(filename_basename); free(sm_filename); free(sm_data); free(sh); @@ -152,6 +155,7 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, err = comm->c_coll->coll_barrier (comm, comm->c_coll->coll_barrier_module ); if ( OMPI_SUCCESS != err ) { opal_output(0,"mca_sharedfp_sm_file_open: Error in barrier operation \n"); + free(filename_basename); free(sm_filename); free(sm_data); free(sh); @@ -169,6 +173,7 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, err = OMPI_ERROR; opal_output(0, "mca_sharedfp_sm_file_open: Error, unable to mmap file: %s\n",sm_filename); opal_output(0, "%s\n", strerror(errno)); + free(filename_basename); free(sm_filename); free(sm_data); free(sh); @@ -187,6 +192,10 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, sm_data->sem_name = (char*) malloc( sizeof(char) * 253); snprintf(sm_data->sem_name,252,"OMPIO_%s",filename_basename); #endif + // We're now done with filename_basename. Free it here so that we + // don't have to keep freeing it in the error/return cases. + free(filename_basename); + filename_basename = NULL; if( (sm_data->mutex = sem_open(sm_data->sem_name, O_CREAT, 0644, 1)) != SEM_FAILED ) { #elif defined(HAVE_SEM_INIT) From 05e25489ebab23e5ec2e42ab249de357b35d57ad Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Mon, 3 Jan 2022 09:25:40 -0600 Subject: [PATCH 4/5] sharedfp_sm_file_open: minor code cleanup use asprintf instead of calculating lenght of the string and using malloc. This solution matches now the code used during the file_query operation of the same component. Signed-off-by: Edgar Gabriel (cherry picked from commit 4af40ed96742938182e66f78457e3f1da7796ee3) --- ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c index b1702b148ca..fd4d087083e 100644 --- a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c +++ b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2013-2018 University of Houston. All rights reserved. + * Copyright (c) 2013-2021 University of Houston. All rights reserved. * Copyright (c) 2013 Intel, Inc. All rights reserved. * Copyright (c) 2015-2018 Research Organization for Information Science * and Technology (RIST). All rights reserved. @@ -59,7 +59,6 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, struct mca_sharedfp_sm_data * sm_data = NULL; char * filename_basename; char * sm_filename; - int sm_filename_length; struct mca_sharedfp_sm_offset * sm_offset_ptr; struct mca_sharedfp_sm_offset sm_offset; int sm_fd; @@ -105,15 +104,6 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, */ filename_basename = opal_basename((char*)filename); /* format is "%s/%s_cid-%d-%d.sm", see below */ - sm_filename_length = strlen(ompi_process_info.job_session_dir) + 1 + strlen(filename_basename) + 5 + (3*sizeof(uint32_t)+1) + 4; - sm_filename = (char*) malloc( sizeof(char) * sm_filename_length); - if (NULL == sm_filename) { - opal_output(0, "mca_sharedfp_sm_file_open: Error, unable to malloc sm_filename\n"); - free(filename_basename); - free(sm_data); - free(sh); - return OMPI_ERR_OUT_OF_RESOURCE; - } comm_cid = ompi_comm_get_cid(comm); if ( 0 == fh->f_rank ) { @@ -130,7 +120,7 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, return err; } - snprintf(sm_filename, sm_filename_length, "%s/%s_cid-%d-%d.sm", ompi_process_info.job_session_dir, + asprintf(&sm_filename, "%s/%s_cid-%d-%d.sm", ompi_process_info.job_session_dir, filename_basename, comm_cid, int_pid); /* open shared memory file, initialize to 0, map into memory */ sm_fd = open(sm_filename, O_RDWR | O_CREAT, From 889092d9d86d01751061ba83b6d5ff7495052d20 Mon Sep 17 00:00:00 2001 From: Edgar Gabriel Date: Tue, 4 Jan 2022 04:04:10 -0600 Subject: [PATCH 5/5] sharedfp_sm_file_open: remove illegal free accidentally left an invalid free after the rewriting the code to use asprintf. Fixes Coverty CID 1496849 Signed-off-by: Edgar Gabriel (cherry picked from commit a260b884b809522537dd1c2d3e134f49f4f1f4c0) --- ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c index fd4d087083e..0b56a76443c 100644 --- a/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c +++ b/ompi/mca/sharedfp/sm/sharedfp_sm_file_open.c @@ -114,7 +114,6 @@ int mca_sharedfp_sm_file_open (struct ompi_communicator_t *comm, if ( OMPI_SUCCESS != err ) { opal_output(0,"mca_sharedfp_sm_file_open: Error in bcast operation \n"); free(filename_basename); - free(sm_filename); free(sm_data); free(sh); return err;