Skip to content

Commit

Permalink
Merge pull request #12354 from edgargabriel/topic/smcuda-delayed-sae-…
Browse files Browse the repository at this point in the history
…init

btl/smcuda: add delayed stream initialization
  • Loading branch information
edgargabriel authored Mar 6, 2024
2 parents ecd1f42 + 835eef5 commit a25dd5f
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 25 deletions.
4 changes: 4 additions & 0 deletions opal/mca/btl/smcuda/btl_smcuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* Copyright (c) 2010-2015 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2012-2023 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -204,6 +205,9 @@ struct mca_btl_smcuda_component_t {
int use_cuda_ipc;
int use_cuda_ipc_same_gpu;

int accelerator_delayed_ipc_init;
int accelerator_max_ipc_events;

unsigned long mpool_min_size;
char *allocator;
};
Expand Down
78 changes: 53 additions & 25 deletions opal/mca/btl/smcuda/btl_smcuda_accelerator.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* Copyright (c) 2023 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -33,62 +34,80 @@ static int accelerator_event_ipc_first_used;
static volatile int accelerator_event_ipc_num_used;

/* Size of array holding events */
static int accelerator_event_max = 400;
static int accelerator_event_ipc_most = 0;
static bool smcuda_accelerator_initialized = false;

void mca_btl_smcuda_accelerator_fini(void);

int mca_btl_smcuda_accelerator_init(void)
/* Initialize the internal ipc stream and the events (s&e) */
static int mca_btl_smcuda_accelerator_ipc_init(void)
{
int rc = OPAL_SUCCESS;
int i;
OBJ_CONSTRUCT(&btl_smcuda_accelerator_ipc_lock, opal_mutex_t);
/* The first available status index is 0. Make an empty frag
array. */
int device_id;

rc = opal_accelerator.get_device(&device_id);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Failed to retrieve current device.");
return OPAL_ERROR;
}

rc = opal_accelerator.create_stream(MCA_ACCELERATOR_NO_DEVICE_ID, &ipc_stream);
rc = opal_accelerator.create_stream(device_id, &ipc_stream);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Failed to create accelerator ipc_stream stream.");
goto cleanup_and_error;
return OPAL_ERROR;
}

/* Create the events since they can be reused. */
for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
rc = opal_accelerator.create_event(device_id, &accelerator_event_ipc_array[i], opal_accelerator_use_sync_memops ? false : true);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed.");
return OPAL_ERROR;
}
}

return OPAL_SUCCESS;
}

int mca_btl_smcuda_accelerator_init(void)
{
int rc = OPAL_SUCCESS;
int i;

OBJ_CONSTRUCT(&btl_smcuda_accelerator_ipc_lock, opal_mutex_t);

accelerator_event_ipc_num_used = 0;
accelerator_event_ipc_first_avail = 0;
accelerator_event_ipc_first_used = 0;

accelerator_event_ipc_array = calloc(accelerator_event_max, sizeof(opal_accelerator_event_t *));
accelerator_event_ipc_array = calloc(mca_btl_smcuda_component.accelerator_max_ipc_events, sizeof(opal_accelerator_event_t *));
if (NULL == accelerator_event_ipc_array) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}
/* Create the events since they can be reused. */
for (i = 0; i < accelerator_event_max; i++) {
rc = opal_accelerator.create_event(MCA_ACCELERATOR_NO_DEVICE_ID, &accelerator_event_ipc_array[i], opal_accelerator_use_sync_memops ? false : true);
if (OPAL_SUCCESS != rc) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "Accelerator create event failed.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}
}

/* The first available status index is 0. Make an empty frag
array. */

accelerator_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc(sizeof(struct mca_btl_base_descriptor_t *) * accelerator_event_max);
accelerator_event_ipc_frag_array = (struct mca_btl_base_descriptor_t **) malloc(sizeof(struct mca_btl_base_descriptor_t *) *
mca_btl_smcuda_component.accelerator_max_ipc_events);
if (NULL == accelerator_event_ipc_frag_array) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "No memory.");
rc = OPAL_ERROR;
goto cleanup_and_error;
}

if (!mca_btl_smcuda_component.accelerator_delayed_ipc_init) {
mca_btl_smcuda_accelerator_ipc_init();
}

smcuda_accelerator_initialized = true;

cleanup_and_error:
if (OPAL_SUCCESS != rc) {
if (NULL != accelerator_event_ipc_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
if (NULL != accelerator_event_ipc_array[i]) {
OBJ_RELEASE(accelerator_event_ipc_array[i]);
}
Expand Down Expand Up @@ -117,7 +136,7 @@ void mca_btl_smcuda_accelerator_fini(void)
}

if (NULL != accelerator_event_ipc_array) {
for (i = 0; i < accelerator_event_max; i++) {
for (i = 0; i < mca_btl_smcuda_component.accelerator_max_ipc_events; i++) {
if (NULL != accelerator_event_ipc_array[i]) {
OBJ_RELEASE(accelerator_event_ipc_array[i]);
}
Expand All @@ -129,7 +148,9 @@ void mca_btl_smcuda_accelerator_fini(void)
free(accelerator_event_ipc_frag_array);
}

OBJ_RELEASE(ipc_stream);
if (NULL != ipc_stream) {
OBJ_RELEASE(ipc_stream);
}

OBJ_DESTRUCT(&btl_smcuda_accelerator_ipc_lock);
smcuda_accelerator_initialized = false;
Expand Down Expand Up @@ -175,7 +196,7 @@ int mca_btl_smcuda_progress_one_ipc_event(struct mca_btl_base_descriptor_t **fra
/* Bump counters, loop around the circular buffer if necessary */
--accelerator_event_ipc_num_used;
++accelerator_event_ipc_first_used;
if (accelerator_event_ipc_first_used >= accelerator_event_max) {
if (accelerator_event_ipc_first_used >= mca_btl_smcuda_component.accelerator_max_ipc_events) {
accelerator_event_ipc_first_used = 0;
}
/* A return value of 1 indicates an event completed and a frag was returned */
Expand All @@ -196,10 +217,17 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,
int result;
OPAL_THREAD_LOCK(&btl_smcuda_accelerator_ipc_lock);

if (NULL == ipc_stream) {
result = mca_btl_smcuda_accelerator_ipc_init();
if (OPAL_SUCCESS != result) {
return result;
}
}

/* First make sure there is room to store the event. If not, then
* return an error. The error message will tell the user to try and
* run again, but with a larger array for storing events. */
if (accelerator_event_ipc_num_used == accelerator_event_max) {
if (accelerator_event_ipc_num_used == mca_btl_smcuda_component.accelerator_max_ipc_events) {
opal_output_verbose(1, mca_btl_smcuda_component.cuda_ipc_output, "smcuda: Out of event handles");
OPAL_THREAD_UNLOCK(&btl_smcuda_accelerator_ipc_lock);
return OPAL_ERR_OUT_OF_RESOURCE;
Expand Down Expand Up @@ -237,7 +265,7 @@ int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,

/* Bump up the first available slot and number used by 1 */
accelerator_event_ipc_first_avail++;
if (accelerator_event_ipc_first_avail >= accelerator_event_max) {
if (accelerator_event_ipc_first_avail >= mca_btl_smcuda_component.accelerator_max_ipc_events) {
accelerator_event_ipc_first_avail = 0;
}
accelerator_event_ipc_num_used++;
Expand Down
17 changes: 17 additions & 0 deletions opal/mca/btl/smcuda/btl_smcuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
* Copyright (c) 2022 IBM Corporation. All rights reserved.
* Copyright (c) 2023 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -169,6 +170,22 @@ static int smcuda_register(void)
mca_btl_smcuda_param_register_uint("fifo_lazy_free", 120, OPAL_INFO_LVL_5,
&mca_btl_smcuda_component.fifo_lazy_free);

/* Delay the creation of the IPC stream and events. This has the advantage of also
* working in scenarios where the user did not set the accelerator device
* before MPI_Init AND the stream/event has internally some reference to the device
* used at that time */
mca_btl_smcuda_component.accelerator_delayed_ipc_init = 1;
(void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "delayed_stream_init",
"Delay the initialization of the ipc stream and internal events",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.accelerator_delayed_ipc_init);

mca_btl_smcuda_component.accelerator_max_ipc_events = 400;
(void) mca_base_component_var_register(&mca_btl_smcuda_component.super.btl_version, "max_ipc_events",
"Number of events created by the smcuda components internally",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY, &mca_btl_smcuda_component.accelerator_max_ipc_events);

/* default number of extra procs to allow for future growth */
mca_btl_smcuda_param_register_int("sm_extra_procs", 0, OPAL_INFO_LVL_9,
&mca_btl_smcuda_component.sm_extra_procs);
Expand Down

0 comments on commit a25dd5f

Please sign in to comment.