From 331c616324e8df64a3373090794d25ded2f476cf Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Fri, 7 Jul 2023 01:39:51 +0000 Subject: [PATCH] prov/efa: Modify how EFA selects new intranode provider Change the env variable that controls EFA's intranode provider from FI_EFA_USE_SM2 to FI_EFA_INTRANODE_PROVIDER which makes it more generic. This switches the variable from a bool to a string, and allows it to also take the place of FI_EFA_ENABLE_SHM_TRANSFER, which we deprecate in this commit. When someone sets FI_EFA_INTRANODE_PROVIDER=efa, we turn EFA's intranode optimization off. FI_EFA_INTRANODE_PROVIDER will override the value of FI_EFA_ENABLE_SHM_TRANSFER. Signed-off-by: Seth Zegelstein --- prov/efa/src/efa_domain.c | 5 ++++- prov/efa/src/efa_env.c | 29 +++++++++++++++++++++++------ prov/efa/src/efa_env.h | 2 +- prov/efa/src/efa_prov_info.c | 2 +- prov/efa/src/efa_shm.c | 18 ++++++------------ prov/efa/src/rdm/efa_rdm_ep_fiops.c | 8 ++++++-- 6 files changed, 41 insertions(+), 23 deletions(-) diff --git a/prov/efa/src/efa_domain.c b/prov/efa/src/efa_domain.c index dfff0172b9d..a88ee4faab0 100644 --- a/prov/efa/src/efa_domain.c +++ b/prov/efa/src/efa_domain.c @@ -129,7 +129,10 @@ static int efa_domain_init_rdm(struct efa_domain *efa_domain, struct fi_info *in { int err; - efa_shm_info_create(info, &efa_domain->shm_info); + if (strcmp(efa_env.intranode_provider, "efa")) + efa_shm_info_create(info, &efa_domain->shm_info); + else + efa_domain->shm_info = NULL; if (efa_domain->shm_info) { err = fi_fabric(efa_domain->shm_info->fabric_attr, diff --git a/prov/efa/src/efa_env.c b/prov/efa/src/efa_env.c index 585537148a7..48694636ff8 100644 --- a/prov/efa/src/efa_env.c +++ b/prov/efa/src/efa_env.c @@ -67,8 +67,8 @@ struct efa_env efa_env = { .efa_write_segment_size = 1073741824, /* need to confirm this constant. 
*/ .rnr_retry = 3, /* Setting this value to EFA_RNR_INFINITE_RETRY makes the firmware retry indefinitey */ .host_id_file = "/sys/devices/virtual/dmi/id/board_asset_tag", /* Available on EC2 instances and containers */ - .use_sm2 = false, .huge_page_setting = EFA_ENV_HUGE_PAGE_UNSPEC, + .intranode_provider = "shm", }; /** @@ -127,7 +127,6 @@ void efa_env_param_get(void) } fi_param_get_int(&efa_prov, "tx_queue_size", &efa_env.tx_queue_size); - fi_param_get_int(&efa_prov, "enable_shm_transfer", &efa_env.enable_shm_transfer); fi_param_get_int(&efa_prov, "use_zcpy_rx", &efa_env.use_zcpy_rx); fi_param_get_int(&efa_prov, "set_cuda_sync_memops", &efa_env.set_cuda_sync_memops); fi_param_get_int(&efa_prov, "zcpy_rx_seed", &efa_env.zcpy_rx_seed); @@ -164,13 +163,30 @@ void efa_env_param_get(void) &efa_env.efa_read_segment_size); fi_param_get_size_t(&efa_prov, "inter_max_gdrcopy_message_size", &efa_env.efa_max_gdrcopy_msg_size); - fi_param_get_bool(&efa_prov, "use_sm2", &efa_env.use_sm2); int use_huge_page; if (fi_param_get_bool(&efa_prov, "use_huge_page", &use_huge_page) ==0) { efa_env.huge_page_setting = use_huge_page ? 
EFA_ENV_HUGE_PAGE_ENABLED : EFA_ENV_HUGE_PAGE_DISABLED; } + fi_param_get_int(&efa_prov, "enable_shm_transfer", &efa_env.enable_shm_transfer); + if (efa_env.enable_shm_transfer == 0) { + efa_env.intranode_provider = "efa"; + EFA_WARN(FI_LOG_CORE, "FI_EFA_ENABLE_SHM_TRANSFER is deprecated (and will be removed in a future release), " + "use FI_EFA_INTRANODE_PROVIDER=efa to specify the EFA provider for intra-node communication.\n"); + } + + /* Setting FI_EFA_INTRANODE_PROVIDER will override FI_EFA_ENABLE_SHM_TRANSFER=0 */ + fi_param_get_str(&efa_prov, "intranode_provider", &efa_env.intranode_provider); + if (strcmp(efa_env.intranode_provider, "efa") && + strcmp(efa_env.intranode_provider, "shm") && + strcmp(efa_env.intranode_provider, "sm2")) { + EFA_WARN(FI_LOG_CORE, "FI_EFA_INTRANODE_PROVIDER=%s, EFA supports 'shm', 'sm2' and 'efa'" + " for intra-node communication. Unsupported provider name. Aborting...\n", + efa_env.intranode_provider); + abort(); + } + efa_fork_support_request_initialize(); } @@ -181,8 +197,9 @@ void efa_env_define() "Defines the minimum number of credits a sender requests from a receiver (Default: 32)."); fi_param_define(&efa_prov, "tx_queue_size", FI_PARAM_INT, "Defines the maximum number of unacknowledged sends with the NIC."); + /* TODO Remove enable_shm_transfer on future release */ fi_param_define(&efa_prov, "enable_shm_transfer", FI_PARAM_INT, - "Enable using SHM provider to perform TX operations between processes on the same system. (Default: 1)"); + "(Deprecated, use FI_EFA_INTRANODE_PROVIDER=efa to turn off SHM. Will remove in future release.) Enable using SHM provider to perform TX operations between processes on the same system. (Default: 1)"); fi_param_define(&efa_prov, "use_zcpy_rx", FI_PARAM_INT, "Enables the use of application's receive buffers in place of bounce-buffers when feasible. 
(Default: 1)"); fi_param_define(&efa_prov, "set_cuda_sync_memops", FI_PARAM_INT, @@ -235,13 +252,13 @@ void efa_env_define() "Enables fork support and disables internal usage of huge pages. Has no effect on kernels which set copy-on-fork for registered pages, generally 5.13 and later. (Default: false)"); fi_param_define(&efa_prov, "runt_size", FI_PARAM_INT, "The maximum number of bytes that will be eagerly sent by inflight messages uses runting read message protocol (Default 307200)."); - fi_param_define(&efa_prov, "use_sm2", FI_PARAM_BOOL, - "Use the experimental shared memory provider SM2 for intra node communication."); fi_param_define(&efa_prov, "use_huge_page", FI_PARAM_BOOL, "Whether EFA provider can use huge page memory for internal buffer. " "Using huge page memory has a small performance advantage, but can " "cause system to run out of huge page memory. By default, EFA provider " "will use huge page unless FI_EFA_FORK_SAFE is set to 1/on/true."); + fi_param_define(&efa_prov, "intranode_provider", FI_PARAM_STRING, + "The name of the provider that EFA should offload intra-node communications to (Default shm)."); } diff --git a/prov/efa/src/efa_env.h b/prov/efa/src/efa_env.h index 3dc86f419f6..e8130a0f23a 100644 --- a/prov/efa/src/efa_env.h +++ b/prov/efa/src/efa_env.h @@ -107,8 +107,8 @@ struct efa_env { * is malformatted, the program should proceed with a default host id, e.g. 0. */ char *host_id_file; - int use_sm2; enum efa_env_huge_page_setting huge_page_setting; + char *intranode_provider; }; /** diff --git a/prov/efa/src/efa_prov_info.c b/prov/efa/src/efa_prov_info.c index 8d346dac986..f2359580b0c 100644 --- a/prov/efa/src/efa_prov_info.c +++ b/prov/efa/src/efa_prov_info.c @@ -649,7 +649,7 @@ int efa_prov_info_alloc_for_rdm(struct fi_info **prov_info_rdm_ptr, * then send the packet entry. Therefore the maximum inject size is * pkt_entry_size - maximum_header_size. 
*/ - if (efa_env.enable_shm_transfer) + if (strcmp(efa_env.intranode_provider, "efa")) min_pkt_size = MIN(device->rdm_info->ep_attr->max_msg_size, efa_env.shm_max_medium_size); else min_pkt_size = device->rdm_info->ep_attr->max_msg_size; diff --git a/prov/efa/src/efa_shm.c b/prov/efa/src/efa_shm.c index aaabace6c10..ec82b8a6cef 100644 --- a/prov/efa/src/efa_shm.c +++ b/prov/efa/src/efa_shm.c @@ -98,13 +98,6 @@ void efa_shm_info_create(const struct fi_info *app_info, struct fi_info **shm_in int ret; struct fi_info *shm_hints; - char *shm_provider; - if (efa_env.use_sm2) { - shm_provider = "sm2"; - } else { - shm_provider = "shm"; - } - shm_hints = fi_allocinfo(); shm_hints->caps = app_info->caps; shm_hints->caps &= ~FI_REMOTE_COMM; @@ -133,18 +126,19 @@ void efa_shm_info_create(const struct fi_info *app_info, struct fi_info **shm_in */ shm_hints->tx_attr->op_flags = FI_COMPLETION; shm_hints->rx_attr->op_flags = FI_COMPLETION; - shm_hints->fabric_attr->name = strdup(shm_provider); - shm_hints->fabric_attr->prov_name = strdup(shm_provider); + shm_hints->fabric_attr->name = strdup(efa_env.intranode_provider); + shm_hints->fabric_attr->prov_name = strdup(efa_env.intranode_provider); shm_hints->ep_attr->type = FI_EP_RDM; ret = fi_getinfo(FI_VERSION(1, 19), NULL, NULL, OFI_GETINFO_HIDDEN, shm_hints, shm_info); fi_freeinfo(shm_hints); if (ret) { - EFA_WARN(FI_LOG_CORE, "Disabling EFA shared memory support; failed to get shm provider's info: %s\n", - fi_strerror(-ret)); + EFA_WARN(FI_LOG_CORE, "Disabling EFA's shared memory support; " + "Failed to get info struct for provider %s: %s\n", + efa_env.intranode_provider, fi_strerror(-ret)); *shm_info = NULL; } else { - assert(!strcmp((*shm_info)->fabric_attr->name, shm_provider)); + assert(!strcmp((*shm_info)->fabric_attr->name, efa_env.intranode_provider)); } } diff --git a/prov/efa/src/rdm/efa_rdm_ep_fiops.c b/prov/efa/src/rdm/efa_rdm_ep_fiops.c index df2dbfc2849..466e65edef0 100644 --- a/prov/efa/src/rdm/efa_rdm_ep_fiops.c 
+++ b/prov/efa/src/rdm/efa_rdm_ep_fiops.c @@ -113,7 +113,7 @@ void efa_rdm_pke_pool_mr_dereg_handler(struct ofi_bufpool_region *region) /** * @brief creates a packet entry pool. - * + * * The pool is allowed to grow if * max_cnt is 0 and is fixed size otherwise. * @@ -936,7 +936,11 @@ void efa_rdm_ep_set_use_shm_for_tx(struct efa_rdm_ep *ep) return; } - ep->use_shm_for_tx = efa_env.enable_shm_transfer; + if (strcmp(efa_env.intranode_provider, "efa")) + ep->use_shm_for_tx = true; + else + ep->use_shm_for_tx = false; + return; }