Skip to content

Commit

Permalink
oshmem/shmem: Allocate and exchange base segment address beforehand
Browse files Browse the repository at this point in the history
Signed-off-by: Thomas Vegas <[email protected]>
  • Loading branch information
tvegas1 committed Oct 28, 2024
1 parent c393881 commit 019badb
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 24 deletions.
139 changes: 134 additions & 5 deletions oshmem/mca/memheap/base/memheap_base_select.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include "oshmem/mca/sshmem/base/base.h"
#include "ompi/util/timings.h"

#include <sys/mman.h>

mca_memheap_base_config_t mca_memheap_base_config = {
.device_nic_mem_seg_size = 0
};
Expand Down Expand Up @@ -106,6 +108,128 @@ static size_t _memheap_size(void)
return (size_t) memheap_align(oshmem_shmem_info_env.symmetric_heap_size);
}

static void *memheap_mmap_get(void *hint, size_t size)
{
void *addr;

addr = mmap(hint, size,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED) {
return NULL;
}

return addr;
}

static int memheap_exchange_base_address(size_t size, void **address)
{
int nprocs = oshmem_num_procs();
void *base = NULL;
void *ptr = NULL;
int rc, i;
void **bases;

bases = calloc(nprocs, sizeof(*bases));
if (NULL == bases) {
return OSHMEM_ERROR;
}

if (oshmem_my_proc_id() == 0) {
ptr = memheap_mmap_get(NULL, size);
base = ptr;
}

rc = oshmem_shmem_bcast(&base, sizeof(base), 0);
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to exchange allocated vma for base segment "
"(error %d)", rc);
goto out;
}

if (oshmem_my_proc_id() != 0) {
ptr = memheap_mmap_get(base, size);
}

MEMHEAP_VERBOSE(100, "#%d: exchange base address: base %p: %s",
oshmem_my_proc_id(), base,
(base == ptr)? "ok" : "unavailable");

rc = oshmem_shmem_allgather(&ptr, bases, sizeof(ptr));
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to exchange selected vma for base segment "
"(error %d)", rc);
goto out;
}

*address = base;
for (i = 0; i < nprocs; i++) {
if ((NULL == bases[i]) || (bases[i] != base)) {
*address = NULL;
break;
}
}

out:
if (((OSHMEM_SUCCESS != rc) || (*address == NULL)) && (NULL != ptr)) {
(void)munmap(ptr, size);
}

free(bases);
return rc;
}


/*
* The returned mca_sshmem_base_start_address value is reserved by using
* mmap() for the expected size.
*/
static int memheap_base_segment_setup(size_t size)
{
int rc;

if (mca_sshmem_base_start_address == (void *)UINTPTR_MAX) {
if (UINTPTR_MAX == 0xFFFFFFFF) {
/**
* if 32 bit we set sshmem_base_start_adress to 0
* to let OS allocate segment automatically
*/
mca_sshmem_base_start_address = NULL;
return OSHMEM_SUCCESS;
}

rc = memheap_exchange_base_address(size, &mca_sshmem_base_start_address);
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to setup base segment address (error %d)", rc);
return rc;
}

if (NULL != mca_sshmem_base_start_address) {
goto done; /* Region is reserved */
}

#if defined(__aarch64__)
mca_sshmem_base_start_address = (void*)0xAB0000000000;
#else
mca_sshmem_base_start_address = (void*)0xFF000000;
#endif
}

if (mca_sshmem_base_start_address != memheap_mmap_get(
mca_sshmem_base_start_address, size)) {
MEMHEAP_ERROR("Failed to create segment address %p/%zu",
mca_sshmem_base_start_address, size);
return OSHMEM_ERROR;
}

done:
if (oshmem_my_proc_id() == 0) {
MEMHEAP_VERBOSE(10, "Using symmetric segment address %p/%zu",
mca_sshmem_base_start_address, size);
}

return OSHMEM_SUCCESS;
}

static memheap_context_t* _memheap_create(void)
{
int rc = OSHMEM_SUCCESS;
Expand All @@ -123,13 +247,18 @@ static memheap_context_t* _memheap_create(void)

OPAL_TIMING_ENV_NEXT(timing, "_memheap_size()");

/* Inititialize symmetric area */
if (OSHMEM_SUCCESS == rc) {
rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0,
"regular_mem");
/* Locate and reserve symmetric area */
rc = memheap_base_segment_setup(user_size + MEMHEAP_BASE_PRIVATE_SIZE);
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to negotiate base segment addres");
return NULL;
}

/* Inititialize symmetric area */
rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0,
"regular_mem");

OPAL_TIMING_ENV_NEXT(timing, "mca_memheap_base_alloc_init()");

/* Initialize atomic symmetric area */
Expand Down
38 changes: 30 additions & 8 deletions oshmem/mca/memheap/base/memheap_base_static.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,17 @@
#include <pthread.h>

static int _check_perms(const char *perm);
static int _check_non_static_segment(const map_segment_t *mem_segs,
int n_segment,
const void *start, const void *end);
static int _check_address(void *start, void **end);
static int _check_pathname(uint64_t inode, const char *pathname);

int mca_memheap_base_static_init(mca_memheap_map_t *map)
{
/* read and parse segments from /proc/self/maps */
int ret = OSHMEM_SUCCESS;
int n_segments = map->n_segments;
uint64_t total_mem = 0;
void* start;
void* end;
Expand All @@ -54,14 +58,6 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map)
return OSHMEM_ERROR;
}

#ifdef __linux__
extern unsigned _end;
if (mca_sshmem_base_start_address < (uintptr_t)&_end) {
MEMHEAP_WARN("sshmem base start address is inside data region"
" (%p < %p)", mca_sshmem_base_start_address, &_end);
}
#endif

while (NULL != fgets(line, sizeof(line), fp)) {
if (3 > sscanf(line,
"%llx-%llx %s %llx %s %llx %s",
Expand All @@ -77,6 +73,12 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map)
goto out;
}

if (OSHMEM_ERROR == _check_non_static_segment(
map->mem_segs, n_segments,
start, end)) {
continue;
}

if (OSHMEM_ERROR == _check_address(start, &end))
continue;

Expand Down Expand Up @@ -138,6 +140,26 @@ static int _check_perms(const char *perms)
return OSHMEM_ERROR;
}

static int _check_non_static_segment(const map_segment_t *mem_segs,
int n_segment,
const void *start, const void *end)
{
int i;

for (i = 0; i < n_segment; i++) {
if ((start <= mem_segs[i].super.va_base) &&
(mem_segs[i].super.va_base < end)) {
MEMHEAP_VERBOSE(100,
"non static segment: %p-%p already exists as %p-%p",
start, end, mem_segs[i].super.va_base,
mem_segs[i].super.va_end);
return OSHMEM_ERROR;
}
}

return OSHMEM_SUCCESS;
}

static int _check_address(void *start, void **end)
{
/* FIXME Linux specific code */
Expand Down
12 changes: 1 addition & 11 deletions oshmem/mca/sshmem/base/sshmem_base_open.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,7 @@
* globals
*/

/**
* if 32 bit we set sshmem_base_start_address to 0
* to let OS allocate segment automatically
*/
#if UINTPTR_MAX == 0xFFFFFFFF
void *mca_sshmem_base_start_address = (void*)0;
#elif defined(__aarch64__)
void* mca_sshmem_base_start_address = (void*)0xAB0000000000;
#else
void* mca_sshmem_base_start_address = (void*)0xFF000000;
#endif
void *mca_sshmem_base_start_address = UINTPTR_MAX;

char * mca_sshmem_base_backing_file_dir = NULL;

Expand Down
1 change: 1 addition & 0 deletions oshmem/mca/sshmem/mmap/sshmem_mmap_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ segment_create(map_segment_t *ds_buf,
/* init the contents of map_segment_t */
shmem_ds_reset(ds_buf);

(void)munmap(mca_sshmem_base_start_address, size);
addr = mmap((void *)mca_sshmem_base_start_address,
size,
PROT_READ | PROT_WRITE,
Expand Down
1 change: 1 addition & 0 deletions oshmem/mca/sshmem/sysv/sshmem_sysv_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ segment_create(map_segment_t *ds_buf,
}

/* Attach to the segment */
(void)munmap(mca_sshmem_base_start_address, size);
addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0);
if (addr == (void *) -1L) {
opal_show_help("help-oshmem-sshmem.txt",
Expand Down
9 changes: 9 additions & 0 deletions oshmem/runtime/oshmem_shmem_exchange.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@
#include "oshmem/runtime/runtime.h"
#include "oshmem/runtime/params.h"

int oshmem_shmem_bcast(void *buf, int elem_size, int root)
{
int rc;

rc = PMPI_Bcast(buf, elem_size, MPI_BYTE, root, oshmem_comm_world);

return rc;
}

int oshmem_shmem_allgather(void *send_buf, void *rcv_buf, int elem_size)
{
int rc;
Expand Down
5 changes: 5 additions & 0 deletions oshmem/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ int oshmem_shmem_finalize(void);
*/
OSHMEM_DECLSPEC int oshmem_shmem_abort(int errcode);

/**
* Broadcast between all PEs
*/
OSHMEM_DECLSPEC int oshmem_shmem_bcast(void *buf, int elem_size, int root);

/**
* Allgather between all PEs
*/
Expand Down

0 comments on commit 019badb

Please sign in to comment.