From ecb7d87e8e8166610c67c05e62814dbe83a2ea69 Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Sun, 12 Jan 2025 13:37:57 +0000 Subject: [PATCH 1/5] UCT/IB/MLX5/RC: perf tuning - decrease rc latency estimation --- src/uct/ib/mlx5/rc/rc_mlx5_iface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uct/ib/mlx5/rc/rc_mlx5_iface.c b/src/uct/ib/mlx5/rc/rc_mlx5_iface.c index 078d24149bb..72dc9c2ac31 100644 --- a/src/uct/ib/mlx5/rc/rc_mlx5_iface.c +++ b/src/uct/ib/mlx5/rc/rc_mlx5_iface.c @@ -188,7 +188,7 @@ static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr uct_rc_mlx5_iface_common_query(&rc_iface->super, iface_attr, max_am_inline, UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(0)); iface_attr->cap.flags |= UCT_IFACE_FLAG_EP_CHECK; - iface_attr->latency.m += 1e-9; /* 1 ns per each extra QP */ + iface_attr->latency.m += 32e-11; /* 0.32 ns per each extra QP */ iface_attr->ep_addr_len = ep_addr_len; iface_attr->iface_addr_len = sizeof(uint8_t); return UCS_OK; From 17db3af35b7dc22ef30eef9562b8ab0e3e7aa3ee Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Mon, 13 Jan 2025 15:46:08 +0000 Subject: [PATCH 2/5] UCT/IB/MLX5/DC: added latency to DC when AR enabled instead of decreasing RC, added env for it --- src/uct/ib/mlx5/dc/dc_mlx5.c | 12 ++++++++++++ src/uct/ib/mlx5/dc/dc_mlx5.h | 3 +++ src/uct/ib/mlx5/rc/rc_mlx5_iface.c | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/uct/ib/mlx5/dc/dc_mlx5.c b/src/uct/ib/mlx5/dc/dc_mlx5.c index 9bd70dae7b2..a9209999bea 100644 --- a/src/uct/ib/mlx5/dc/dc_mlx5.c +++ b/src/uct/ib/mlx5/dc/dc_mlx5.c @@ -140,6 +140,12 @@ ucs_config_field_t uct_dc_mlx5_iface_config_sub_table[] = { ucs_offsetof(uct_dc_mlx5_iface_config_t, dcis_initial_capacity), UCS_CONFIG_TYPE_UINT}, + {"FULL_HANDSHAKE_ADDED_LATENCY", "110ns", + "Amount of latency added to performance estimation of DC due to full handshake " + "(used when AR is enabled).", + ucs_offsetof(uct_dc_mlx5_iface_config_t, fhs_added_latency), + UCS_CONFIG_TYPE_TIME_UNITS}, + {NULL} }; @@ -242,6 +248,11 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr sizeof(uct_dc_mlx5_iface_addr_t); iface_attr->latency.c += 60e-9; /* connect packet + cqe */ + /* Full handshake is used when AR is enabled */ + if (iface->super.config.dp_ordering == UCT_IB_MLX5_DP_ORDERING_OOO_RW) { + iface_attr->latency.c += ucs_time_to_sec(iface->tx.fhs_added_latency); + } + uct_rc_mlx5_iface_common_query(&iface->super.super.super, iface_attr, max_am_inline, UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(UCT_IB_MLX5_AV_FULL_SIZE)); @@ -1662,6 +1673,7 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor self->tx.fc_hard_req_progress_cb_id = UCS_CALLBACKQ_ID_NULL; self->tx.num_dci_pools = 0; self->flags = 0; + self->tx.fhs_added_latency = config->fhs_added_latency; self->tx.av_fl_mlid = self->super.super.super.path_bits[0] & 0x7f; kh_init_inplace(uct_dc_mlx5_fc_hash, &self->tx.fc_hash); diff --git a/src/uct/ib/mlx5/dc/dc_mlx5.h b/src/uct/ib/mlx5/dc/dc_mlx5.h index a2398832a32..48f5e418d0a 100644 --- a/src/uct/ib/mlx5/dc/dc_mlx5.h +++ b/src/uct/ib/mlx5/dc/dc_mlx5.h @@ -187,6 +187,7 @@ typedef struct uct_dc_mlx5_iface_config { uct_ud_mlx5_iface_common_config_t mlx5_ud; unsigned num_dci_channels; unsigned dcis_initial_capacity; + ucs_time_t fhs_added_latency; } uct_dc_mlx5_iface_config_t; @@ -345,6 +346,8 @@ struct uct_dc_mlx5_iface { /* used in hybrid dcs policy otherwise -1 */ uint16_t hybrid_hw_dci; + + ucs_time_t fhs_added_latency; } tx; struct { diff --git a/src/uct/ib/mlx5/rc/rc_mlx5_iface.c b/src/uct/ib/mlx5/rc/rc_mlx5_iface.c index 72dc9c2ac31..078d24149bb 100644 --- a/src/uct/ib/mlx5/rc/rc_mlx5_iface.c +++ b/src/uct/ib/mlx5/rc/rc_mlx5_iface.c @@ -188,7 +188,7 @@ static ucs_status_t uct_rc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr uct_rc_mlx5_iface_common_query(&rc_iface->super, iface_attr, max_am_inline, UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(0)); iface_attr->cap.flags |= UCT_IFACE_FLAG_EP_CHECK; - iface_attr->latency.m += 32e-11; /* 0.32 ns per each extra QP */ + iface_attr->latency.m += 1e-9; /* 1 ns per each extra QP */ iface_attr->ep_addr_len = ep_addr_len; iface_attr->iface_addr_len = sizeof(uint8_t); return UCS_OK; From d4c62222cc135c2222e7c826614f9cdd0ccd72e2 Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Wed, 15 Jan 2025 12:44:22 +0000 Subject: [PATCH 3/5] UCT/IB/MLX5/DC: check ooo_sl_mask and user-forced full handshake --- src/uct/ib/mlx5/dc/dc_mlx5.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/uct/ib/mlx5/dc/dc_mlx5.c b/src/uct/ib/mlx5/dc/dc_mlx5.c index a9209999bea..d345facfb1e 100644 --- a/src/uct/ib/mlx5/dc/dc_mlx5.c +++ b/src/uct/ib/mlx5/dc/dc_mlx5.c @@ -214,8 +214,10 @@ uct_dc_mlx5_ep_create_connected(const uct_ep_params_t *params, uct_ep_h* ep_p) static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_mlx5_iface_t); + uct_ib_mlx5_md_t *md = uct_ib_mlx5_iface_md(&iface->super.super.super); size_t max_am_inline = UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); size_t max_put_inline = UCT_IB_MLX5_PUT_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); + uint16_t ooo_sl_mask = 0; ucs_status_t status; #if HAVE_IBV_DM @@ -248,8 +250,18 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr sizeof(uct_dc_mlx5_iface_addr_t); iface_attr->latency.c += 60e-9; /* connect packet + cqe */ +#if HAVE_DEVX + status = uct_ib_mlx5_devx_query_ooo_sl_mask( + md, iface->super.super.super.config.port_num, &ooo_sl_mask); + if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) { + return status; + } +#endif + /* Full handshake is used when AR is enabled */ - if (iface->super.config.dp_ordering == UCT_IB_MLX5_DP_ORDERING_OOO_RW) { + if ((iface->super.config.dp_ordering == UCT_IB_MLX5_DP_ORDERING_OOO_RW) || + (iface->flags & UCT_DC_MLX5_IFACE_FLAG_DCI_FULL_HANDSHAKE) || + (UCS_BIT(iface->super.super.super.config.sl) & ooo_sl_mask)) { iface_attr->latency.c += ucs_time_to_sec(iface->tx.fhs_added_latency); } From 2d1649dfafbe5e429d90c86121f0458da5ae56fc Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Wed, 15 Jan 2025 14:05:48 +0000 Subject: [PATCH 4/5] UCT/IB/MLX5/DC: DC latency should be increased also when DDP is enabled --- src/uct/ib/mlx5/dc/dc_mlx5.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/uct/ib/mlx5/dc/dc_mlx5.c b/src/uct/ib/mlx5/dc/dc_mlx5.c index d345facfb1e..8ebd9df6e48 100644 --- a/src/uct/ib/mlx5/dc/dc_mlx5.c +++ b/src/uct/ib/mlx5/dc/dc_mlx5.c @@ -258,8 +258,10 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr } #endif - /* Full handshake is used when AR is enabled */ - if ((iface->super.config.dp_ordering == UCT_IB_MLX5_DP_ORDERING_OOO_RW) || + /* Full handshake is used when AR / DDP is enabled + * or when the user explicitly forces it + */ + if ((iface->super.config.dp_ordering >= UCT_IB_MLX5_DP_ORDERING_OOO_RW) || (iface->flags & UCT_DC_MLX5_IFACE_FLAG_DCI_FULL_HANDSHAKE) || (UCS_BIT(iface->super.super.super.config.sl) & ooo_sl_mask)) { iface_attr->latency.c += ucs_time_to_sec(iface->tx.fhs_added_latency); From 7201aa44e651298962c9e4524de2a9a0a55c52f9 Mon Sep 17 00:00:00 2001 From: Roie Danino Date: Sun, 19 Jan 2025 15:42:40 +0000 Subject: [PATCH 5/5] UCT/IB/MLX5/DC: fixed unused md variable --- src/uct/ib/mlx5/dc/dc_mlx5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/uct/ib/mlx5/dc/dc_mlx5.c b/src/uct/ib/mlx5/dc/dc_mlx5.c index 8ebd9df6e48..fd7b0d60361 100644 --- a/src/uct/ib/mlx5/dc/dc_mlx5.c +++ b/src/uct/ib/mlx5/dc/dc_mlx5.c @@ -214,10 +214,10 @@ uct_dc_mlx5_ep_create_connected(const uct_ep_params_t *params, uct_ep_h* ep_p) static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr_t *iface_attr) { uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_mlx5_iface_t); - uct_ib_mlx5_md_t *md = uct_ib_mlx5_iface_md(&iface->super.super.super); size_t max_am_inline = UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); size_t max_put_inline = UCT_IB_MLX5_PUT_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); uint16_t ooo_sl_mask = 0; + uct_ib_mlx5_md_t UCS_V_UNUSED *md; ucs_status_t status; #if HAVE_IBV_DM @@ -251,6 +251,7 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr iface_attr->latency.c += 60e-9; /* connect packet + cqe */ #if HAVE_DEVX + md = uct_ib_mlx5_iface_md(&iface->super.super.super); status = uct_ib_mlx5_devx_query_ooo_sl_mask( md, iface->super.super.super.config.port_num, &ooo_sl_mask); if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) {