-
Notifications
You must be signed in to change notification settings - Fork 434
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
UCT/IB/MD: retry memory registration with reduced access flags on failure #10341
base: master
Are you sure you want to change the base?
Changes from 4 commits
e47f0ff
0ed3576
5b5a4ba
ebd6386
e2564b6
bcf4d11
7f0ad47
0be06fc
547432e
f45911c
fdaa23a
a25d39b
251cd9a
2dc3f68
26e991e
bc3cf3c
d2b1b60
8aa1b4e
0ad3eb4
22ec2c2
256bd62
c6f8cbe
e3aa7ed
a6a700e
bae0a11
399eb3a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ Alex Margolin <[email protected]> | |
Alex Mikheev <[email protected]> | ||
Alexey Rivkin <[email protected]> | ||
Alina Sklarevich <[email protected]> | ||
Alma Mastbaum <[email protected]> | ||
Anatoly Vildemanov <[email protected]> | ||
Andrey Maslennikov <[email protected]> | ||
Artem Polyakov <[email protected]> | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -309,7 +309,7 @@ void *uct_ib_md_mem_handle_thread_func(void *arg) | |
while (ctx->length > 0) { | ||
if (ctx->params != NULL) { | ||
status = uct_ib_reg_mr(ctx->md, ctx->address, length, ctx->params, | ||
ctx->access_flags, NULL, &ctx->mrs[mr_idx]); | ||
ctx->access_flags, NULL, &ctx->mrs[mr_idx], 1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why hide_errors is true here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @yosefe Oh, I actually wanted to put here a value that doesn't change the previous behavior and put |
||
if (status != UCS_OK) { | ||
goto err_dereg; | ||
} | ||
|
@@ -460,7 +460,7 @@ uct_ib_md_handle_mr_list_mt(uct_ib_md_t *md, void *address, size_t length, | |
ucs_status_t uct_ib_reg_mr(uct_ib_md_t *md, void *address, size_t length, | ||
const uct_md_mem_reg_params_t *params, | ||
uint64_t access_flags, struct ibv_dm *dm, | ||
struct ibv_mr **mr_p) | ||
struct ibv_mr **mr_p, int first_attempt) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
{ | ||
ucs_time_t UCS_V_UNUSED start_time = ucs_get_time(); | ||
unsigned long retry = 0; | ||
|
@@ -507,7 +507,8 @@ ucs_status_t uct_ib_reg_mr(uct_ib_md_t *md, void *address, size_t length, | |
if (mr == NULL) { | ||
uct_ib_md_print_mem_reg_err_msg(title, address, length, access_flags, | ||
errno, | ||
flags & UCT_MD_MEM_FLAG_HIDE_ERRORS); | ||
(flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) || | ||
first_attempt); | ||
return UCS_ERR_IO_ERROR; | ||
} | ||
|
||
|
@@ -626,9 +627,39 @@ ucs_status_t uct_ib_memh_alloc(uct_ib_md_t *md, size_t length, | |
return UCS_OK; | ||
} | ||
|
||
uint64_t uct_ib_memh_access_flags(uct_ib_mem_t *memh, int relaxed_order) | ||
uint64_t uct_flags_to_ibv_mem_access_flags(uint64_t uct_flags) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
{ | ||
uint64_t access_flags = UCT_IB_MEM_ACCESS_FLAGS; | ||
uint64_t ibv_flags = 0; | ||
|
||
if (uct_flags & UCT_MD_MEM_ACCESS_LOCAL_WRITE) { | ||
ibv_flags |= IBV_ACCESS_LOCAL_WRITE; | ||
} | ||
|
||
if (uct_flags & UCT_MD_MEM_ACCESS_REMOTE_GET) { | ||
ibv_flags |= IBV_ACCESS_REMOTE_READ; | ||
} | ||
|
||
if (uct_flags & UCT_MD_MEM_ACCESS_REMOTE_PUT) { | ||
ibv_flags |= IBV_ACCESS_REMOTE_WRITE; | ||
} | ||
|
||
if (uct_flags & UCT_MD_MEM_ACCESS_REMOTE_ATOMIC) { | ||
ibv_flags |= IBV_ACCESS_REMOTE_ATOMIC; | ||
} | ||
|
||
return ibv_flags; | ||
} | ||
|
||
uint64_t uct_ib_memh_access_flags(uct_ib_mem_t *memh, int relaxed_order, | ||
int first_attempt, uint64_t uct_flags) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here as well, I'd rename |
||
{ | ||
uint64_t access_flags; | ||
|
||
if (first_attempt) { | ||
access_flags = UCT_IB_MEM_ACCESS_FLAGS; | ||
} else { | ||
access_flags = uct_flags_to_ibv_mem_access_flags(uct_flags); | ||
} | ||
|
||
if (memh->flags & UCT_IB_MEM_FLAG_ODP) { | ||
access_flags |= IBV_ACCESS_ON_DEMAND; | ||
|
@@ -645,26 +676,33 @@ ucs_status_t uct_ib_verbs_mem_reg(uct_md_h uct_md, void *address, size_t length, | |
const uct_md_mem_reg_params_t *params, | ||
uct_mem_h *memh_p) | ||
{ | ||
uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); | ||
uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t); | ||
uint64_t uct_flags = UCT_MD_MEM_REG_FIELD_VALUE(params, flags, | ||
FIELD_FLAGS, 0); | ||
int attempt_cnt = 0; | ||
struct ibv_mr *mr_default; | ||
uct_ib_verbs_mem_t *memh; | ||
uct_ib_mem_t *ib_memh; | ||
uint64_t access_flags; | ||
ucs_status_t status; | ||
|
||
status = uct_ib_memh_alloc(md, length, | ||
UCT_MD_MEM_REG_FIELD_VALUE(params, flags, | ||
FIELD_FLAGS, 0), | ||
status = uct_ib_memh_alloc(md, length, uct_flags, | ||
sizeof(*memh), sizeof(memh->mrs[0]), &ib_memh); | ||
if (status != UCS_OK) { | ||
goto err; | ||
} | ||
|
||
memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t); | ||
access_flags = uct_ib_memh_access_flags(&memh->super, md->relaxed_order); | ||
memh = ucs_derived_of(ib_memh, uct_ib_verbs_mem_t); | ||
|
||
do { | ||
access_flags = uct_ib_memh_access_flags(&memh->super, md->relaxed_order, | ||
attempt_cnt == 0, uct_flags); | ||
|
||
status = uct_ib_reg_mr(md, address, length, params, access_flags, NULL, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Align |
||
&mr_default, attempt_cnt == 0); | ||
attempt_cnt++; | ||
} while ((status != UCS_OK) && (attempt_cnt < 2)); | ||
|
||
status = uct_ib_reg_mr(md, address, length, params, access_flags, NULL, | ||
&mr_default); | ||
if (status != UCS_OK) { | ||
goto err_free; | ||
} | ||
|
@@ -676,7 +714,7 @@ ucs_status_t uct_ib_verbs_mem_reg(uct_md_h uct_md, void *address, size_t length, | |
if (md->relaxed_order) { | ||
status = uct_ib_reg_mr(md, address, length, params, | ||
access_flags & ~IBV_ACCESS_RELAXED_ORDERING, | ||
NULL, &memh->mrs[UCT_IB_MR_STRICT_ORDER].ib); | ||
NULL, &memh->mrs[UCT_IB_MR_STRICT_ORDER].ib, 1); | ||
if (status != UCS_OK) { | ||
goto err_dereg_default; | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -703,11 +703,13 @@ uct_ib_mlx5_devx_reg_mr(uct_ib_mlx5_md_t *md, uct_ib_mlx5_devx_mem_t *memh, | |
uct_ib_mr_type_t mr_type, uint64_t access_mask, | ||
uint32_t *lkey_p, uint32_t *rkey_p) | ||
{ | ||
uint64_t access_flags = uct_ib_memh_access_flags(&memh->super, | ||
md->super.relaxed_order) & | ||
access_mask; | ||
unsigned flags = UCT_MD_MEM_REG_FIELD_VALUE(params, flags, | ||
FIELD_FLAGS, 0); | ||
uint64_t access_flags = uct_ib_memh_access_flags(&memh->super, | ||
md->super.relaxed_order, | ||
1, flags) & | ||
access_mask; | ||
int attempt_cnt = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid abbreviations ( |
||
ucs_status_t status; | ||
uint32_t mkey; | ||
|
||
|
@@ -731,8 +733,16 @@ uct_ib_mlx5_devx_reg_mr(uct_ib_mlx5_md_t *md, uct_ib_mlx5_devx_mem_t *memh, | |
/* Fallback if multi-thread registration is unsupported */ | ||
} | ||
|
||
status = uct_ib_reg_mr(&md->super, address, length, params, access_flags, | ||
NULL, &memh->mrs[mr_type].super.ib); | ||
do { | ||
status = uct_ib_reg_mr(&md->super, address, length, params, access_flags, | ||
NULL, &memh->mrs[mr_type].super.ib, | ||
attempt_cnt == 0); | ||
attempt_cnt++; | ||
access_flags = uct_ib_memh_access_flags(&memh->super, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. First There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it's intentional, though it might be written better.. |
||
md->super.relaxed_order, | ||
attempt_cnt == 0, flags); | ||
} while ((status != UCS_OK) && (attempt_cnt < 2)); | ||
|
||
if (status != UCS_OK) { | ||
return status; | ||
} | ||
|
@@ -792,6 +802,7 @@ uct_ib_mlx5_devx_mem_reg_gva(uct_md_h uct_md, unsigned flags, uct_mem_h *memh_p) | |
{ | ||
uct_ib_mlx5_md_t *md = ucs_derived_of(uct_md, uct_ib_mlx5_md_t); | ||
uct_md_mem_reg_params_t params = {}; | ||
int attempt_cnt = 0; | ||
uct_ib_mlx5_devx_mem_t *memh; | ||
uint64_t access_flags; | ||
ucs_status_t status; | ||
|
@@ -805,9 +816,17 @@ uct_ib_mlx5_devx_mem_reg_gva(uct_md_h uct_md, unsigned flags, uct_mem_h *memh_p) | |
} | ||
|
||
relaxed_order = md->flags & UCT_IB_MLX5_MD_FLAG_GVA_RO; | ||
access_flags = uct_ib_memh_access_flags(&memh->super, relaxed_order); | ||
status = uct_ib_reg_mr(&md->super, NULL, SIZE_MAX, ¶ms, access_flags, | ||
NULL, &memh->mrs[UCT_IB_MR_DEFAULT].super.ib); | ||
|
||
do { | ||
access_flags = uct_ib_memh_access_flags(&memh->super, relaxed_order, | ||
attempt_cnt == 0, flags); | ||
status = uct_ib_reg_mr(&md->super, NULL, SIZE_MAX, ¶ms, | ||
access_flags, NULL, | ||
&memh->mrs[UCT_IB_MR_DEFAULT].super.ib, | ||
attempt_cnt == 0); | ||
attempt_cnt++; | ||
} while ((status != UCS_OK) && (attempt_cnt < 2)); | ||
|
||
if (status != UCS_OK) { | ||
goto err_reg; | ||
} | ||
|
@@ -816,7 +835,7 @@ uct_ib_mlx5_devx_mem_reg_gva(uct_md_h uct_md, unsigned flags, uct_mem_h *memh_p) | |
status = uct_ib_reg_mr(&md->super, NULL, SIZE_MAX, ¶ms, | ||
access_flags & ~IBV_ACCESS_RELAXED_ORDERING, | ||
NULL, | ||
&memh->mrs[UCT_IB_MR_STRICT_ORDER].super.ib); | ||
&memh->mrs[UCT_IB_MR_STRICT_ORDER].super.ib, 1); | ||
if (status != UCS_OK) { | ||
goto err_dereg_default; | ||
} | ||
|
@@ -1826,7 +1845,7 @@ static void uct_ib_mlx5_devx_init_flush_mr(uct_ib_mlx5_md_t *md) | |
|
||
status = uct_ib_reg_mr(&md->super, md->zero_buf, | ||
UCT_IB_MD_FLUSH_REMOTE_LENGTH, ¶ms, | ||
UCT_IB_MEM_ACCESS_FLAGS, NULL, &md->flush_mr); | ||
UCT_IB_MEM_ACCESS_FLAGS, NULL, &md->flush_mr, 1); | ||
if (status != UCS_OK) { | ||
goto err; | ||
} | ||
|
@@ -2027,7 +2046,7 @@ uct_ib_mlx5_devx_device_mem_alloc(uct_md_h uct_md, size_t *length_p, | |
reg_params.field_mask = 0; | ||
status = uct_ib_reg_mr(md, address, dm_attr.length, ®_params, | ||
UCT_IB_MEM_ACCESS_FLAGS, dm, | ||
&memh->mrs[UCT_IB_MR_DEFAULT].super.ib); | ||
&memh->mrs[UCT_IB_MR_DEFAULT].super.ib, 1); | ||
if (status != UCS_OK) { | ||
goto err_munmap_address; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's just the most basic option so I thought I can keep mention it as well, but on the other hand it doesn't matter as local read is enabled by default in ibv_reg_mr (there is no such flag in verbs.h because it's always enabled)