From 071169b4b7cf2f32eda3a93a8f396562f27cf056 Mon Sep 17 00:00:00 2001 From: Thomas Vegas Date: Mon, 28 Oct 2024 13:25:25 +0200 Subject: [PATCH] oshmem/shmem: Allocate and exchange base segment address beforehand Signed-off-by: Thomas Vegas --- oshmem/mca/memheap/base/memheap_base_select.c | 138 +++++++++++++++++- oshmem/mca/memheap/base/memheap_base_static.c | 38 ++++- oshmem/mca/sshmem/base/sshmem_base_open.c | 12 +- oshmem/mca/sshmem/mmap/sshmem_mmap_module.c | 1 + oshmem/mca/sshmem/sysv/sshmem_sysv_module.c | 1 + oshmem/runtime/oshmem_shmem_exchange.c | 9 ++ oshmem/runtime/runtime.h | 5 + 7 files changed, 180 insertions(+), 24 deletions(-) diff --git a/oshmem/mca/memheap/base/memheap_base_select.c b/oshmem/mca/memheap/base/memheap_base_select.c index c32f799d373..a61d014a1c6 100644 --- a/oshmem/mca/memheap/base/memheap_base_select.c +++ b/oshmem/mca/memheap/base/memheap_base_select.c @@ -25,6 +25,8 @@ #include "oshmem/mca/sshmem/base/base.h" #include "ompi/util/timings.h" +#include + mca_memheap_base_config_t mca_memheap_base_config = { .device_nic_mem_seg_size = 0 }; @@ -106,6 +108,127 @@ static size_t _memheap_size(void) return (size_t) memheap_align(oshmem_shmem_info_env.symmetric_heap_size); } +static void *memheap_mmap_get(void *hint, size_t size) +{ + void *addr; + + addr = mmap(hint, size, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + return NULL; + } + + return addr; +} + +static int memheap_exchange_base_address(size_t size, void **address) +{ + int nprocs = oshmem_num_procs(); + void *base = NULL; + void *ptr = NULL; + int rc, i; + void **bases; + + bases = calloc(nprocs, sizeof(*bases)); + if (NULL == bases) { + return OSHMEM_ERROR; + } + + if (oshmem_my_proc_id() == 0) { + ptr = memheap_mmap_get(NULL, size); + base = ptr; + } + + rc = oshmem_shmem_bcast(&base, sizeof(base), 0); + if (OSHMEM_SUCCESS != rc) { + MEMHEAP_ERROR("Failed to exchange allocated vma for base segment " + "(error %d)", rc); + goto out; + } + + if (oshmem_my_proc_id() != 0) { + ptr = memheap_mmap_get(base, size); + } + + MEMHEAP_VERBOSE(100, "#%d: exchange base address: base %p: %s", + oshmem_my_proc_id(), base, + (NULL != ptr)? "ok" : "unavailable"); + + rc = oshmem_shmem_allgather(&ptr, bases, sizeof(ptr)); + if (OSHMEM_SUCCESS != rc) { + MEMHEAP_ERROR("Failed to exchange selected vma for base segment " + "(error %d)", rc); + goto out; + } + + *address = base; + for (i = 0; i < nprocs; i++) { + if ((NULL == bases[i]) || (bases[i] != base)) { + *address = NULL; + break; + } + } + +out: + if (((OSHMEM_SUCCESS != rc) || (*address == NULL)) && (NULL != ptr)) { + (void)munmap(ptr, size); + } + + free(bases); + return rc; +} + + +/* + * The returned mca_sshmem_base_start_address value is reserved by using + * mmap() for the expected size. + */ +static int memheap_base_segment_setup(size_t size) +{ + int rc; + + if (mca_sshmem_base_start_address == (void *)UINTPTR_MAX) { + if (UINTPTR_MAX == 0xFFFFFFFF) { + /** + * if 32 bit we set sshmem_base_start_adress to 0 + * to let OS allocate segment automatically + */ + mca_sshmem_base_start_address = NULL; + return OSHMEM_SUCCESS; + } + + rc = memheap_exchange_base_address(size, &mca_sshmem_base_start_address); + if (OSHMEM_SUCCESS != rc) { + MEMHEAP_ERROR("Failed to setup base segment address (error %d)", rc); + return rc; + } + + if (NULL != mca_sshmem_base_start_address) { + goto done; /* Region is reserved */ + } + +#if defined(__aarch64__) + mca_sshmem_base_start_address = (void*)0xAB0000000000; +#else + mca_sshmem_base_start_address = (void*)0xFF000000; +#endif + } + + if (NULL == memheap_mmap_get(mca_sshmem_base_start_address, size)) { + MEMHEAP_ERROR("Failed to create segment address %p/%zu", + mca_sshmem_base_start_address, size); + return OSHMEM_ERROR; + } + +done: + if (oshmem_my_proc_id() == 0) { + MEMHEAP_VERBOSE(10, "Using symmetric segment address %p/%zu", + mca_sshmem_base_start_address, size); + } + + return OSHMEM_SUCCESS; +} + static memheap_context_t* _memheap_create(void) { int rc = OSHMEM_SUCCESS; @@ -123,13 +246,18 @@ static memheap_context_t* _memheap_create(void) OPAL_TIMING_ENV_NEXT(timing, "_memheap_size()"); - /* Inititialize symmetric area */ - if (OSHMEM_SUCCESS == rc) { - rc = mca_memheap_base_alloc_init(&mca_memheap_base_map, - user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0, - "regular_mem"); + /* Locate and reserve symmetric area */ + rc = memheap_base_segment_setup(user_size + MEMHEAP_BASE_PRIVATE_SIZE); + if (OSHMEM_SUCCESS != rc) { + MEMHEAP_ERROR("Failed to negotiate base segment addres"); + return NULL; } + /* Inititialize symmetric area */ + rc = mca_memheap_base_alloc_init(&mca_memheap_base_map, + user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0, + "regular_mem"); + OPAL_TIMING_ENV_NEXT(timing, "mca_memheap_base_alloc_init()"); /* Initialize atomic symmetric area */ diff --git a/oshmem/mca/memheap/base/memheap_base_static.c b/oshmem/mca/memheap/base/memheap_base_static.c index 1b3819b87de..9b50132fbd3 100644 --- a/oshmem/mca/memheap/base/memheap_base_static.c +++ b/oshmem/mca/memheap/base/memheap_base_static.c @@ -25,6 +25,9 @@ #include static int _check_perms(const char *perm); +static int _check_non_static_segment(const map_segment_t *mem_segs, + int n_segment, + const void *start, const void *end); static int _check_address(void *start, void **end); static int _check_pathname(uint64_t inode, const char *pathname); @@ -32,6 +35,7 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map) { /* read and parse segments from /proc/self/maps */ int ret = OSHMEM_SUCCESS; + int n_segments = map->n_segments; uint64_t total_mem = 0; void* start; void* end; @@ -54,14 +58,6 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map) return OSHMEM_ERROR; } -#ifdef __linux__ - extern unsigned _end; - if (mca_sshmem_base_start_address < (uintptr_t)&_end) { - MEMHEAP_WARN("sshmem base start address is inside data region" - " (%p < %p)", mca_sshmem_base_start_address, &_end); - } -#endif - while (NULL != fgets(line, sizeof(line), fp)) { if (3 > sscanf(line, "%llx-%llx %s %llx %s %llx %s", @@ -77,6 +73,12 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map) goto out; } + if (OSHMEM_ERROR == _check_non_static_segment( + map->mem_segs, n_segments, + start, end)) { + continue; + } + if (OSHMEM_ERROR == _check_address(start, &end)) continue; @@ -138,6 +140,26 @@ static int _check_perms(const char *perms) return OSHMEM_ERROR; } +static int _check_non_static_segment(const map_segment_t *mem_segs, + int n_segment, + const void *start, const void *end) +{ + int i; + + for (i = 0; i < n_segment; i++) { + if ((start <= mem_segs[i].super.va_base) && + (mem_segs[i].super.va_base < end)) { + MEMHEAP_VERBOSE(100, + "non static segment: %p-%p already exists as %p-%p", + start, end, mem_segs[i].super.va_base, + mem_segs[i].super.va_end); + return OSHMEM_ERROR; + } + } + + return OSHMEM_SUCCESS; +} + static int _check_address(void *start, void **end) { /* FIXME Linux specific code */ diff --git a/oshmem/mca/sshmem/base/sshmem_base_open.c b/oshmem/mca/sshmem/base/sshmem_base_open.c index 4f8fe9832e3..1f0d1eb761e 100644 --- a/oshmem/mca/sshmem/base/sshmem_base_open.c +++ b/oshmem/mca/sshmem/base/sshmem_base_open.c @@ -31,17 +31,7 @@ * globals */ -/** - * if 32 bit we set sshmem_base_start_address to 0 - * to let OS allocate segment automatically - */ -#if UINTPTR_MAX == 0xFFFFFFFF -void *mca_sshmem_base_start_address = (void*)0; -#elif defined(__aarch64__) -void* mca_sshmem_base_start_address = (void*)0xAB0000000000; -#else -void* mca_sshmem_base_start_address = (void*)0xFF000000; -#endif +void *mca_sshmem_base_start_address = UINTPTR_MAX; char * mca_sshmem_base_backing_file_dir = NULL; diff --git a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c index bb32c07ca42..b1ed6b57a09 100644 --- a/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c +++ b/oshmem/mca/sshmem/mmap/sshmem_mmap_module.c @@ -126,6 +126,7 @@ segment_create(map_segment_t *ds_buf, /* init the contents of map_segment_t */ shmem_ds_reset(ds_buf); + (void)munmap(mca_sshmem_base_start_address, size); addr = mmap((void *)mca_sshmem_base_start_address, size, PROT_READ | PROT_WRITE, diff --git a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c index b58d4b3535f..5dc6d429fff 100644 --- a/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c +++ b/oshmem/mca/sshmem/sysv/sshmem_sysv_module.c @@ -170,6 +170,7 @@ segment_create(map_segment_t *ds_buf, } /* Attach to the segment */ + (void)munmap(mca_sshmem_base_start_address, size); addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0); if (addr == (void *) -1L) { opal_show_help("help-oshmem-sshmem.txt", diff --git a/oshmem/runtime/oshmem_shmem_exchange.c b/oshmem/runtime/oshmem_shmem_exchange.c index 730eaef2e46..147340e53c4 100644 --- a/oshmem/runtime/oshmem_shmem_exchange.c +++ b/oshmem/runtime/oshmem_shmem_exchange.c @@ -16,6 +16,15 @@ #include "oshmem/runtime/runtime.h" #include "oshmem/runtime/params.h" +int oshmem_shmem_bcast(void *buf, int elem_size, int root) +{ + int rc; + + rc = PMPI_Bcast(buf, elem_size, MPI_BYTE, root, oshmem_comm_world); + + return rc; +} + int oshmem_shmem_allgather(void *send_buf, void *rcv_buf, int elem_size) { int rc; diff --git a/oshmem/runtime/runtime.h b/oshmem/runtime/runtime.h index 957777a0198..02d60a8c137 100644 --- a/oshmem/runtime/runtime.h +++ b/oshmem/runtime/runtime.h @@ -128,6 +128,11 @@ int oshmem_shmem_finalize(void); */ OSHMEM_DECLSPEC int oshmem_shmem_abort(int errcode); +/** + * Broadcast between all PEs + */ +OSHMEM_DECLSPEC int oshmem_shmem_bcast(void *buf, int elem_size, int root); + /** * Allgather between all PEs */