-
Notifications
You must be signed in to change notification settings - Fork 435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCT/IB/MLX5/RC: perf tuning - increase dc latency estimation when AR is enabled #10415
base: master
Are you sure you want to change the base?
Changes from all commits
ecb7d87
17db3af
d4c6222
2d1649d
7201aa4
76c1394
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,6 +140,12 @@ ucs_config_field_t uct_dc_mlx5_iface_config_sub_table[] = { | |
ucs_offsetof(uct_dc_mlx5_iface_config_t, dcis_initial_capacity), | ||
UCS_CONFIG_TYPE_ULUNITS}, | ||
|
||
{"FULL_HANDSHAKE_ADDED_LATENCY", "110ns", | ||
"Amount of latency added to performance estimation of DC due to full handshake " | ||
"(used when AR is enabled).", | ||
ucs_offsetof(uct_dc_mlx5_iface_config_t, fhs_added_latency), | ||
UCS_CONFIG_TYPE_TIME_UNITS}, | ||
|
||
{NULL} | ||
}; | ||
|
||
|
@@ -210,6 +216,8 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr | |
uct_dc_mlx5_iface_t *iface = ucs_derived_of(tl_iface, uct_dc_mlx5_iface_t); | ||
size_t max_am_inline = UCT_IB_MLX5_AM_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); | ||
size_t max_put_inline = UCT_IB_MLX5_PUT_MAX_SHORT(UCT_IB_MLX5_AV_FULL_SIZE); | ||
uint16_t ooo_sl_mask = 0; | ||
uct_ib_mlx5_md_t UCS_V_UNUSED *md; | ||
ucs_status_t status; | ||
|
||
#if HAVE_IBV_DM | ||
|
@@ -242,6 +250,24 @@ static ucs_status_t uct_dc_mlx5_iface_query(uct_iface_h tl_iface, uct_iface_attr | |
sizeof(uct_dc_mlx5_iface_addr_t); | ||
iface_attr->latency.c += 60e-9; /* connect packet + cqe */ | ||
|
||
#if HAVE_DEVX | ||
md = uct_ib_mlx5_iface_md(&iface->super.super.super); | ||
status = uct_ib_mlx5_devx_query_ooo_sl_mask( | ||
md, iface->super.super.super.config.port_num, &ooo_sl_mask); | ||
if ((status != UCS_OK) && (status != UCS_ERR_UNSUPPORTED)) { | ||
return status; | ||
} | ||
#endif | ||
|
||
/* Full handshake is used when AR / DDP is enabled | ||
* or when the user explicitly forces it | ||
*/ | ||
if ((iface->super.config.dp_ordering >= UCT_IB_MLX5_DP_ORDERING_OOO_RW) || | ||
(iface->flags & UCT_DC_MLX5_IFACE_FLAG_DCI_FULL_HANDSHAKE) || | ||
(UCS_BIT(iface->super.super.super.config.sl) & ooo_sl_mask)) { | ||
iface_attr->latency.c += ucs_time_to_sec(iface->tx.fhs_added_latency); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should do it also when user forces full handshake or using SL with adaptive routing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wouldn't that mean creating a dummy dci? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm right, so better avoid it |
||
} | ||
|
||
uct_rc_mlx5_iface_common_query(&iface->super.super.super, iface_attr, | ||
max_am_inline, | ||
UCT_RC_MLX5_TM_EAGER_ZCOPY_MAX_IOV(UCT_IB_MLX5_AV_FULL_SIZE)); | ||
|
@@ -1669,6 +1695,7 @@ static UCS_CLASS_INIT_FUNC(uct_dc_mlx5_iface_t, uct_md_h tl_md, uct_worker_h wor | |
self->tx.fc_hard_req_progress_cb_id = UCS_CALLBACKQ_ID_NULL; | ||
self->tx.num_dci_pools = 0; | ||
self->flags = 0; | ||
self->tx.fhs_added_latency = config->fhs_added_latency; | ||
self->tx.av_fl_mlid = self->super.super.super.path_bits[0] & 0x7f; | ||
|
||
kh_init_inplace(uct_dc_mlx5_fc_hash, &self->tx.fc_hash); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shouldn't we double (or even 3x) the latency since there is another round trip?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In general it adds 2.7us (which is ~x30) but I tried to set a number that makes UCX select DC only when it improves latency and bandwidth compared to RC (BTW setting bigger values eventually makes UCX select UD over DC unless we increase estimated UD latency)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean double the current latency estimation. Since FHS adds a round trip for connection establishment.
Does UD really have a better performance than DC FHS?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, UD Latency is much better than DC FHS, 1.62us vs 4.27us in small sizes