From 4c9732d680192cc0a466ddaf7df293a2d30e856f Mon Sep 17 00:00:00 2001 From: Lindsay Reiser Date: Tue, 16 Jul 2024 08:59:35 -0400 Subject: [PATCH] prov/opx: Add runtime parameters for SDMA, RZV, MP egr disable Add the ability for users to specify the minimum message length at which SDMA is used (FI_OPX_SDMA_MIN_PAYLOAD_BYTES), the minimum length at which rendezvous is used (FI_OPX_RZV_MIN_PAYLOAD_BYTES), and a multi-packet eager option (FI_OPX_MP_EAGER_DISABLE) to enable or disable multi-packet eager. This allows the user to tune the default values. Tuning these parameters may impact performance. Signed-off-by: Lindsay Reiser --- man/fi_opx.7.md | 13 +++ prov/opx/include/rdma/opx/fi_opx_domain.h | 6 +- prov/opx/include/rdma/opx/fi_opx_endpoint.h | 90 +++++++++++--------- prov/opx/include/rdma/opx/fi_opx_hfi1.h | 18 ++-- prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h | 2 +- prov/opx/src/fi_opx_domain.c | 28 +++--- prov/opx/src/fi_opx_ep.c | 69 +++++++++++---- prov/opx/src/fi_opx_hfi1.c | 2 +- prov/opx/src/fi_opx_init.c | 4 +- 9 files changed, 150 insertions(+), 82 deletions(-) diff --git a/man/fi_opx.7.md b/man/fi_opx.7.md index eae021110b5..4aa6f60a482 100644 --- a/man/fi_opx.7.md +++ b/man/fi_opx.7.md @@ -202,6 +202,19 @@ OPX is not compatible with Open MPI 4.1.x PML/BTL. *FI_OPX_SDMA_DISABLE* : Integer. Disables SDMA offload hardware. Default is 0 +*FI_OPX_SDMA_MIN_PAYLOAD_BYTES* +: Integer. The minimum length in bytes where SDMA will be used. + For messages smaller than this threshold, the send will be completed using PIO. + Value must be between 64 and 2147483646. Defaults to 16385. + +*FI_OPX_RZV_MIN_PAYLOAD_BYTES* +: Integer. The minimum length in bytes where rendezvous will be used. + For messages smaller than this threshold, the send will first try to be completed using eager or multi-packet eager. + Value must be between 64 and 65536. Defaults to 16385. + +*FI_OPX_MP_EAGER_DISABLE* +: Boolean (0/1, on/off, true/false, yes/no). Disables multi-packet eager. Defaults to 0. 
+ *FI_OPX_EXPECTED_RECEIVE_ENABLE* : Boolean (0/1, on/off, true/false, yes/no). Enables expected receive rendezvous using Token ID (TID). Defaults to "No". This feature is not currently supported. diff --git a/prov/opx/include/rdma/opx/fi_opx_domain.h b/prov/opx/include/rdma/opx/fi_opx_domain.h index 2d60d4bcacc..9e902097df2 100644 --- a/prov/opx/include/rdma/opx/fi_opx_domain.h +++ b/prov/opx/include/rdma/opx/fi_opx_domain.h @@ -99,9 +99,9 @@ struct fi_opx_node { #define OPX_JOB_KEY_STR_SIZE 33 #define OPX_DEFAULT_JOB_KEY_STR "00112233445566778899aabbccddeeff" -#define OPX_SDMA_BOUNCE_BUF_MIN FI_OPX_SDMA_MIN_LENGTH -#define OPX_SDMA_BOUNCE_BUF_THRESHOLD FI_OPX_SDMA_DC_MIN -#define OPX_SDMA_BOUNCE_BUF_MAX (INT_MAX - 1) +#define OPX_SDMA_BOUNCE_BUF_MIN FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN +#define OPX_SDMA_BOUNCE_BUF_THRESHOLD FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT +#define OPX_SDMA_BOUNCE_BUF_MAX FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX struct fi_opx_domain { struct fid_domain domain_fid; diff --git a/prov/opx/include/rdma/opx/fi_opx_endpoint.h b/prov/opx/include/rdma/opx/fi_opx_endpoint.h index f31a3b7f70b..7c5c9a51160 100644 --- a/prov/opx/include/rdma/opx/fi_opx_endpoint.h +++ b/prov/opx/include/rdma/opx/fi_opx_endpoint.h @@ -270,7 +270,7 @@ struct fi_opx_ep_tx { uint64_t cq_bind_flags; struct fi_opx_context_slist * cq_completed_ptr; uint32_t do_cq_completion; - uint16_t mp_eager_max_payload_bytes; + uint16_t unused_cacheline1; uint8_t force_credit_return; uint8_t use_sdma; @@ -301,7 +301,10 @@ struct fi_opx_ep_tx { struct ofi_bufpool *rma_payload_pool; struct ofi_bufpool *rma_request_pool; struct ofi_bufpool *sdma_work_pool; - uint64_t unused_cacheline6[2]; + uint32_t sdma_min_payload_bytes; + uint32_t rzv_min_payload_bytes; + uint16_t mp_eager_max_payload_bytes; + uint8_t unused_cacheline6[6]; /* == CACHE LINE 7 == */ struct opx_sdma_queue sdma_request_queue; @@ -328,7 +331,7 @@ OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending) == (FI_OPX_C 
OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, work_pending_completion) == (FI_OPX_CACHE_LINE_SIZE * 6), "Offset of fi_opx_ep_tx->work_pending_completion should start at cacheline 6!"); OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, sdma_request_queue) == (FI_OPX_CACHE_LINE_SIZE * 7), - "Offset of fi_opx_ep_tx->ref_cnt should start at cacheline 7!"); + "Offset of fi_opx_ep_tx->sdma_request_queue should start at cacheline 7!"); OPX_COMPILE_TIME_ASSERT(offsetof(struct fi_opx_ep_tx, ref_cnt) == (FI_OPX_CACHE_LINE_SIZE * 8), "Offset of fi_opx_ep_tx->ref_cnt should start at cacheline 8!"); @@ -3899,7 +3902,8 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, const enum ofi_reliability_kind reliability, const uint64_t do_cq_completion, const enum fi_hmem_iface hmem_iface, - const uint64_t hmem_device) + const uint64_t hmem_device, + const bool mp_eager_fallback) { ssize_t rc; @@ -3922,7 +3926,7 @@ ssize_t fi_opx_ep_tx_send_try_eager(struct fid_ep *ep, if (OFI_LIKELY(rc == FI_SUCCESS)) { return rc; #ifndef FI_OPX_MP_EGR_DISABLE - } else if (rc == -FI_ENOBUFS && len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE) { + } else if (rc == -FI_ENOBUFS && mp_eager_fallback) { /* Insufficient credits. If the payload is big enough, fall back to Multi-packet eager to try sending this in smaller chunks. 
*/ @@ -4116,48 +4120,52 @@ ssize_t fi_opx_ep_tx_send_internal (struct fid_ep *ep, const uint64_t do_cq_completion = fi_opx_ep_tx_do_cq_completion(opx_ep, override_flags, tx_op_flags); - if (total_len <= opx_ep->tx->pio_max_eager_tx_bytes) { - - rc = fi_opx_ep_tx_send_try_eager(ep, buf, len, desc, addr, tag, context, local_iov, - niov, total_len, data, lock_required, is_contiguous, - override_flags, tx_op_flags, caps, reliability, - do_cq_completion, hmem_iface, hmem_device); - if (OFI_LIKELY(rc == FI_SUCCESS)) { - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); - return rc; + if (total_len < opx_ep->tx->rzv_min_payload_bytes) { + const bool mp_eager_fallback = (total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE && + total_len <= opx_ep->tx->mp_eager_max_payload_bytes); + if (total_len <= opx_ep->tx->pio_max_eager_tx_bytes) { + + rc = fi_opx_ep_tx_send_try_eager(ep, buf, len, desc, addr, tag, context, local_iov, + niov, total_len, data, lock_required, is_contiguous, + override_flags, tx_op_flags, caps, reliability, + do_cq_completion, hmem_iface, hmem_device, + mp_eager_fallback); + if (OFI_LIKELY(rc == FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); + return rc; + } + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND -- Eager send failed, trying next method\n"); } - OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND"); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND -- Eager send failed, trying next method\n"); - } #ifndef FI_OPX_MP_EGR_DISABLE - /* If hmem_iface != FI_HMEM_SYSTEM, we skip MP EGR because RZV yields better performance for devices */ - if (is_contiguous && - total_len <= opx_ep->tx->mp_eager_max_payload_bytes && - total_len > FI_OPX_MP_EGR_CHUNK_PAYLOAD_SIZE && - !fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps) && - (caps & FI_TAGGED) && hmem_iface == FI_HMEM_SYSTEM) { - - rc = 
fi_opx_hfi1_tx_send_try_mp_egr(ep, buf, len, desc, addr.fi, tag, - context, data, lock_required, override_flags, - tx_op_flags, caps, reliability, do_cq_completion, - FI_HMEM_SYSTEM, 0ul); - if (OFI_LIKELY(rc == FI_SUCCESS)) { - OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); - return rc; + /* If hmem_iface != FI_HMEM_SYSTEM, we skip MP EGR because RZV yields better performance for devices */ + if (is_contiguous && + mp_eager_fallback && + !fi_opx_hfi1_tx_is_intranode(opx_ep, addr, caps) && + (caps & FI_TAGGED) && hmem_iface == FI_HMEM_SYSTEM) { + + rc = fi_opx_hfi1_tx_send_try_mp_egr(ep, buf, len, desc, addr.fi, tag, + context, data, lock_required, override_flags, + tx_op_flags, caps, reliability, do_cq_completion, + FI_HMEM_SYSTEM, 0ul); + if (OFI_LIKELY(rc == FI_SUCCESS)) { + OPX_TRACER_TRACE(OPX_TRACER_END_SUCCESS, "SEND"); + return rc; + } + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND -- MP-Eager send failed, trying next method\n"); } - OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN, "SEND"); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND -- MP-Eager send failed, trying next method\n"); - } #endif - if (OFI_UNLIKELY(total_len < FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES)) { - OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN,"SEND"); - FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, - "===================================== SEND -- FI_EAGAIN Can't do RZV with payload length = %ld\n",len); - return -FI_EAGAIN; + if (OFI_UNLIKELY(total_len < FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES)) { + OPX_TRACER_TRACE(OPX_TRACER_END_EAGAIN,"SEND"); + FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, + "===================================== SEND -- FI_EAGAIN Can't do RZV with payload length = %ld\n",len); + return -FI_EAGAIN; + } } rc = fi_opx_ep_tx_send_rzv(ep, diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1.h b/prov/opx/include/rdma/opx/fi_opx_hfi1.h 
index 8935ae86105..b59ee0054b4 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1.h @@ -87,6 +87,13 @@ #define OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT (16384) /* Default for max payload size for using Multi-packet Eager */ #define OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX (65535) /* Max value (set to fit within uint16_t) */ +#define OPX_MP_EGR_DISABLE_SET (1) +#define OPX_MP_EGR_DISABLE_NOT_SET (0) +#define OPX_MP_EGR_DISABLE_DEFAULT (OPX_MP_EGR_DISABLE_NOT_SET) + +#define OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT (OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT+1) /* Default for payload threshold size for RZV */ +#define OPX_RZV_MIN_PAYLOAD_BYTES_MAX (OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX+1) /* Max value */ +#define OPX_RZV_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) /* Min value */ /* The total size for a single packet used in a multi-packet eager send. This is packet payload plus 64 bytes for the PBC and packet header. @@ -176,15 +183,12 @@ static_assert(OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX >= OPX_MP_EGR_MAX_PAYLOAD_BYTES_D #define FI_OPX_HFI1_SDMA_MAX_COMP_INDEX (128) // This should what opx_ep->hfi->info.sdma.queue_size is set to. -#ifndef FI_OPX_SDMA_MIN_LENGTH -#define FI_OPX_SDMA_MIN_LENGTH (16385) +#ifndef FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT (16385) #endif +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN (FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES) +#define FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX (INT_MAX-1) -/* - * The minimum payload size threshold for which we will use delivery completion - * instead of copying the payload for reliability. 
- */ -#define FI_OPX_SDMA_DC_MIN FI_OPX_SDMA_MIN_LENGTH static_assert(!(FI_OPX_HFI1_SDMA_MAX_COMP_INDEX & (FI_OPX_HFI1_SDMA_MAX_COMP_INDEX - 1)), "FI_OPX_HFI1_SDMA_MAX_COMP_INDEX must be power of 2!\n"); static_assert(FI_OPX_HFI1_SDMA_MAX_WE >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX, "FI_OPX_HFI1_SDMA_MAX_WE must be >= FI_OPX_HFI1_SDMA_MAX_COMP_INDEX!\n"); diff --git a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h index 944b7866e6e..e1653b1869b 100644 --- a/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h +++ b/prov/opx/include/rdma/opx/fi_opx_hfi1_sdma.h @@ -214,7 +214,7 @@ bool fi_opx_hfi1_sdma_use_sdma(struct fi_opx_ep *opx_ep, return !is_intranode && (is_hmem || opcode == FI_OPX_HFI_DPUT_OPCODE_RZV_TID - || total_bytes >= FI_OPX_SDMA_MIN_LENGTH) && + || total_bytes >= opx_ep->tx->sdma_min_payload_bytes) && opx_ep->tx->use_sdma; } diff --git a/prov/opx/src/fi_opx_domain.c b/prov/opx/src/fi_opx_domain.c index dbdebbcab95..8d85afa1afb 100644 --- a/prov/opx/src/fi_opx_domain.c +++ b/prov/opx/src/fi_opx_domain.c @@ -380,22 +380,26 @@ int fi_opx_domain(struct fid_fabric *fabric, size_t env_var_threshold; get_param_check = fi_param_get_size_t(fi_opx_global.prov, "dev_reg_send_threshold", &env_var_threshold); - if ((get_param_check == FI_SUCCESS) && (env_var_threshold <= OPX_HMEM_DEV_REG_THRESHOLD_MAX)) { - opx_domain->hmem_domain->devreg_copy_from_threshold = env_var_threshold; - } else { - FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, - "FI_OPX_DEV_REG_SEND_THRESHOLD must be an integer >= %u and <= %u. 
Using default value (%u) instead of %zu\n", - OPX_HMEM_DEV_REG_THRESHOLD_MIN, OPX_HMEM_DEV_REG_THRESHOLD_MAX, OPX_HMEM_DEV_REG_SEND_THRESHOLD_DEFAULT, env_var_threshold); + if (get_param_check == FI_SUCCESS) { + if (env_var_threshold <= OPX_HMEM_DEV_REG_THRESHOLD_MAX) { + opx_domain->hmem_domain->devreg_copy_from_threshold = env_var_threshold; + } else { + FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, + "FI_OPX_DEV_REG_SEND_THRESHOLD must be an integer >= %u and <= %u. Using default value (%u) instead of %zu\n", + OPX_HMEM_DEV_REG_THRESHOLD_MIN, OPX_HMEM_DEV_REG_THRESHOLD_MAX, OPX_HMEM_DEV_REG_SEND_THRESHOLD_DEFAULT, env_var_threshold); + } } get_param_check = fi_param_get_size_t(fi_opx_global.prov, "dev_reg_recv_threshold", &env_var_threshold); - if ((get_param_check == FI_SUCCESS) && (env_var_threshold <= OPX_HMEM_DEV_REG_THRESHOLD_MAX)) { - opx_domain->hmem_domain->devreg_copy_to_threshold = env_var_threshold; - } else { - FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, - "FI_OPX_DEV_REG_RECV_THRESHOLD must be an integer >= %u and <= %u. Using default value (%u) instead of %zu\n", - OPX_HMEM_DEV_REG_THRESHOLD_MIN, OPX_HMEM_DEV_REG_THRESHOLD_MAX, OPX_HMEM_DEV_REG_RECV_THRESHOLD_DEFAULT, env_var_threshold); + if (get_param_check == FI_SUCCESS) { + if (env_var_threshold <= OPX_HMEM_DEV_REG_THRESHOLD_MAX) { + opx_domain->hmem_domain->devreg_copy_to_threshold = env_var_threshold; + } else { + FI_WARN(fi_opx_global.prov, FI_LOG_DOMAIN, + "FI_OPX_DEV_REG_RECV_THRESHOLD must be an integer >= %u and <= %u. 
Using default value (%u) instead of %zu\n", + OPX_HMEM_DEV_REG_THRESHOLD_MIN, OPX_HMEM_DEV_REG_THRESHOLD_MAX, OPX_HMEM_DEV_REG_RECV_THRESHOLD_DEFAULT, env_var_threshold); + } } #endif diff --git a/prov/opx/src/fi_opx_ep.c b/prov/opx/src/fi_opx_ep.c index 57cc0252b25..4dd5df8d50a 100644 --- a/prov/opx/src/fi_opx_ep.c +++ b/prov/opx/src/fi_opx_ep.c @@ -859,6 +859,25 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, opx_ep->tx->pio_scb_first = hfi->info.pio.scb_first; opx_ep->tx->pio_credits_addr = hfi->info.pio.credits_addr; + // Retrieve the parameter for RZV min message length + int l_rzv_min_payload_bytes; + ssize_t rc = fi_param_get_int(fi_opx_global.prov, "rzv_min_payload_bytes", &l_rzv_min_payload_bytes); + if (rc != FI_SUCCESS) { + l_rzv_min_payload_bytes = OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_RZV_MIN_PAYLOAD_BYTES not set. Using default setting of %d\n", + l_rzv_min_payload_bytes); + } else if (l_rzv_min_payload_bytes < OPX_RZV_MIN_PAYLOAD_BYTES_MIN || + l_rzv_min_payload_bytes > OPX_RZV_MIN_PAYLOAD_BYTES_MAX) { + l_rzv_min_payload_bytes = OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Error: FI_OPX_RZV_MIN_PAYLOAD_BYTES was set but is outside min/max thresholds (%d-%d). Using default setting of %d\n", + OPX_RZV_MIN_PAYLOAD_BYTES_MIN, OPX_RZV_MIN_PAYLOAD_BYTES_MAX, l_rzv_min_payload_bytes); + } else { + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_RZV_MIN_PAYLOAD_BYTES was specified. Set to %d\n", + l_rzv_min_payload_bytes); + } + opx_ep->tx->rzv_min_payload_bytes = l_rzv_min_payload_bytes; + /* Now that we know how many PIO Tx send credits we have, calculate the threshold to switch from EAGER send to RTS/CTS * With max credits, there should be enough PIO Eager buffer to send 1 full-size message and 1 credit leftover for min reliablity. 
*/ @@ -892,22 +911,22 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "Set pio_flow_eager_tx_bytes to %d \n", opx_ep->tx->pio_flow_eager_tx_bytes); // Set the multi-packet eager max message length - int l_mp_eager_max_payload_bytes; - ssize_t rc = fi_param_get_int(fi_opx_global.prov, "mp_eager_max_payload_bytes", &l_mp_eager_max_payload_bytes); - if (rc != FI_SUCCESS) { - opx_ep->tx->mp_eager_max_payload_bytes = OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT; - OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_MP_EAGER_MAX_PAYLOAD_BYTES not set. Using default setting of %d\n", - opx_ep->tx->mp_eager_max_payload_bytes); - } else if (l_mp_eager_max_payload_bytes < opx_ep->tx->pio_flow_eager_tx_bytes || l_mp_eager_max_payload_bytes > OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX) { - opx_ep->tx->mp_eager_max_payload_bytes = OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT; - FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Error: FI_OPX_MP_EAGER_MAX_PAYLOAD_BYTES was set but is outside min/max thresholds (%d-%d). Using default setting of %d\n", - opx_ep->tx->pio_flow_eager_tx_bytes, OPX_MP_EGR_MAX_PAYLOAD_BYTES_MAX, opx_ep->tx->mp_eager_max_payload_bytes); + int l_mp_eager_disable; + if (fi_param_get_bool(fi_opx_global.prov, "mp_eager_disable", &l_mp_eager_disable) != FI_SUCCESS) { + l_mp_eager_disable = OPX_MP_EGR_DISABLE_DEFAULT; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_MP_EAGER_DISABLE not set. Using default setting of %d\n", + l_mp_eager_disable); + } else { + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_MP_EAGER_DISABLE was specified. Set to %d\n", + l_mp_eager_disable); + } + + if (l_mp_eager_disable == OPX_MP_EGR_DISABLE_SET) { + opx_ep->tx->mp_eager_max_payload_bytes = 0; } else { - opx_ep->tx->mp_eager_max_payload_bytes = l_mp_eager_max_payload_bytes; - OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_MP_EAGER_MAX_PAYLOAD_BYTES was specified. 
Set to %d\n", - opx_ep->tx->mp_eager_max_payload_bytes); + opx_ep->tx->mp_eager_max_payload_bytes = l_rzv_min_payload_bytes - 1; } + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "Using MP eager threshold of %d\n", opx_ep->tx->mp_eager_max_payload_bytes); OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "Multi-packet eager chunk-size is %d.\n", FI_OPX_MP_EGR_CHUNK_SIZE); /* Set SDMA bounce buffer threshold. Any messages larger than this value in bytes will not be copied to @@ -926,8 +945,8 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, } else if (l_sdma_bounce_buf_threshold < OPX_SDMA_BOUNCE_BUF_MIN || l_sdma_bounce_buf_threshold > (OPX_SDMA_BOUNCE_BUF_MAX)) { opx_ep->tx->sdma_bounce_buf_threshold = OPX_SDMA_BOUNCE_BUF_THRESHOLD; FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, - "Error: FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD was set but is outside of MIN/MAX thresholds. Using default setting of %d\n", - opx_ep->tx->sdma_bounce_buf_threshold); + "Error: FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD was set but is outside of min/max thresholds (%d-%d). Using default setting of %d\n", + OPX_SDMA_BOUNCE_BUF_MIN, OPX_SDMA_BOUNCE_BUF_MAX, opx_ep->tx->sdma_bounce_buf_threshold); } else { opx_ep->tx->sdma_bounce_buf_threshold = l_sdma_bounce_buf_threshold; OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD was specified. Set to %d\n", @@ -951,6 +970,24 @@ static int fi_opx_ep_tx_init (struct fi_opx_ep *opx_ep, opx_ep->tx->use_sdma = 1; } + // Set the SDMA minimum message length + int l_sdma_min_payload_bytes; + rc = fi_param_get_int(fi_opx_global.prov, "sdma_min_payload_bytes", &l_sdma_min_payload_bytes); + if (rc != FI_SUCCESS) { + opx_ep->tx->sdma_min_payload_bytes = FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_MIN_PAYLOAD_BYTES not set. 
Using default setting of %d\n", + opx_ep->tx->sdma_min_payload_bytes); + } else if (l_sdma_min_payload_bytes < FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES || l_sdma_min_payload_bytes > INT_MAX) { + opx_ep->tx->sdma_min_payload_bytes = FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT; + FI_WARN(fi_opx_global.prov, FI_LOG_EP_DATA, + "Error: FI_OPX_SDMA_MIN_PAYLOAD_BYTES was set but is outside min/max thresholds (%d-%d). Using default setting of %d\n", + FI_OPX_HFI1_TX_MIN_RZV_PAYLOAD_BYTES, INT_MAX, opx_ep->tx->sdma_min_payload_bytes); + } else { + opx_ep->tx->sdma_min_payload_bytes = l_sdma_min_payload_bytes; + OPX_LOG_OBSERVABLE(FI_LOG_EP_DATA, "FI_OPX_SDMA_MIN_PAYLOAD_BYTES was specified. Set to %d\n", + opx_ep->tx->sdma_min_payload_bytes); + } + slist_init(&opx_ep->tx->work_pending[OPX_WORK_TYPE_SHM]); slist_init(&opx_ep->tx->work_pending[OPX_WORK_TYPE_PIO]); slist_init(&opx_ep->tx->work_pending[OPX_WORK_TYPE_SDMA]); diff --git a/prov/opx/src/fi_opx_hfi1.c b/prov/opx/src/fi_opx_hfi1.c index 4664746cdec..caa25c7b05d 100644 --- a/prov/opx/src/fi_opx_hfi1.c +++ b/prov/opx/src/fi_opx_hfi1.c @@ -3093,7 +3093,7 @@ ssize_t fi_opx_hfi1_tx_send_rzv (struct fid_ep *ep, /* Expected tid needs to send a leading data block and a trailing * data block for alignment. Limit this to SDMA (8K+) for now */ - const uint64_t immediate_block_count = (len > FI_OPX_SDMA_MIN_LENGTH && opx_ep->use_expected_tid_rzv) ? 1 : 0; + const uint64_t immediate_block_count = (len > opx_ep->tx->sdma_min_payload_bytes && opx_ep->use_expected_tid_rzv) ? 
1 : 0; FI_DBG_TRACE(fi_opx_global.prov, FI_LOG_EP_DATA, "immediate_block_count %#lX *origin_byte_counter_value %#lX, origin_byte_counter_vaddr %p, " "*origin_byte_counter_vaddr %lu/%#lX, len %lu/%#lX\n", diff --git a/prov/opx/src/fi_opx_init.c b/prov/opx/src/fi_opx_init.c index 23f6da7e6d1..e8d7d0a0f8a 100644 --- a/prov/opx/src/fi_opx_init.c +++ b/prov/opx/src/fi_opx_init.c @@ -713,10 +713,12 @@ OPX_INI fi_param_define(&fi_opx_provider, "reliability_service_pre_ack_rate", FI_PARAM_INT, "The number of packets to receive from a particular sender before preemptively acknowledging them without waiting for a ping. Valid values are powers of 2 in the range of 0-32,768, where 0 indicates no preemptive acking. Defaults to 64."); fi_param_define(&fi_opx_provider, "selinux", FI_PARAM_BOOL, "Set to true if you're running a security-enhanced Linux. This enables updating the Jkey used based on system settings. Defaults to \"No\""); fi_param_define(&fi_opx_provider, "hfi_select", FI_PARAM_STRING, "Overrides the normal algorithm used to choose which HFI a process will use. See the documentation for more information."); - fi_param_define(&fi_opx_provider, "mp_eager_max_payload_bytes", FI_PARAM_INT, "Max message length in bytes for a tx to use multi-packet eager. Messages above this size will use rendezvous. Default is %d\n", OPX_MP_EGR_MAX_PAYLOAD_BYTES_DEFAULT); + fi_param_define(&fi_opx_provider, "mp_eager_disable", FI_PARAM_BOOL, "Disables tx multi-packet eager use. Defaults to %d\n", OPX_MP_EGR_DISABLE_DEFAULT); + fi_param_define(&fi_opx_provider, "rzv_min_payload_bytes", FI_PARAM_INT, "The minimum length in bytes where rendezvous will be used. For messages smaller than this threshold, the send will first try to be completed using eager or multi-packet eager. Defaults to %d\n", OPX_RZV_MIN_PAYLOAD_BYTES_DEFAULT); fi_param_define(&fi_opx_provider, "delivery_completion_threshold", FI_PARAM_INT, "Will be deprecated. 
Please use FI_OPX_SDMA_BOUNCE_BUF_THRESHOLD"); fi_param_define(&fi_opx_provider, "sdma_bounce_buf_threshold", FI_PARAM_INT, "The maximum message length in bytes that will be copied to the SDMA bounce buffer. For messages larger than this threshold, the send will not be completed until receiver has ACKed. Value must be between %d and %d. Defaults to %d.", OPX_SDMA_BOUNCE_BUF_MIN, OPX_SDMA_BOUNCE_BUF_MAX, OPX_SDMA_BOUNCE_BUF_THRESHOLD); fi_param_define(&fi_opx_provider, "sdma_disable", FI_PARAM_INT, "Disables SDMA offload hardware. Default is 0"); + fi_param_define(&fi_opx_provider, "sdma_min_payload_bytes", FI_PARAM_INT, "The minimum message length in bytes where SDMA will be used. For messages smaller than this threshold, the send will be completed using PIO. Value must be between %d and %d. Defaults to %d.", FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MIN, FI_OPX_SDMA_MIN_PAYLOAD_BYTES_MAX, FI_OPX_SDMA_MIN_PAYLOAD_BYTES_DEFAULT); fi_param_define(&fi_opx_provider, "expected_receive_enable", FI_PARAM_BOOL, "Enables expected receive rendezvous using Token ID (TID). Defaults to \"No\"."); fi_param_define(&fi_opx_provider, "prog_affinity", FI_PARAM_STRING, "When set, specify the set of CPU cores to set the progress "