mlx5: Introduce data direct placement (DDP) over the DV API #1494

Open · wants to merge 4 commits into master

6 changes: 6 additions & 0 deletions kernel-headers/rdma/bnxt_re-abi.h
@@ -141,8 +141,14 @@ struct bnxt_re_srq_req {
__aligned_u64 srq_handle;
};

enum bnxt_re_srq_mask {
BNXT_RE_SRQ_TOGGLE_PAGE_SUPPORT = 0x1,
};

struct bnxt_re_srq_resp {
__u32 srqid;
__u32 rsvd; /* padding */
__aligned_u64 comp_mask;
};

enum bnxt_re_shpg_offt {
5 changes: 5 additions & 0 deletions kernel-headers/rdma/mlx5-abi.h
@@ -252,6 +252,7 @@ enum mlx5_ib_query_dev_resp_flags {
MLX5_IB_QUERY_DEV_RESP_FLAGS_CQE_128B_PAD = 1 << 1,
MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE = 1 << 2,
MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT = 1 << 3,
MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP = 1 << 4,
};

enum mlx5_ib_tunnel_offloads {
@@ -439,6 +440,10 @@ struct mlx5_ib_burst_info {
__u16 reserved;
};

enum mlx5_ib_modify_qp_mask {
MLX5_IB_MODIFY_QP_OOO_DP = 1 << 0,
};

struct mlx5_ib_modify_qp {
__u32 comp_mask;
struct mlx5_ib_burst_info burst_info;
13 changes: 6 additions & 7 deletions providers/mlx5/cq.c
@@ -231,7 +231,7 @@ static inline int handle_responder_lazy(struct mlx5_cq *cq, struct mlx5_cqe64 *c
wq = &(rsc_to_mrwq(cur_rsc)->rq);
}

-wqe_ctr = wq->tail & (wq->wqe_cnt - 1);
+wqe_ctr = be16toh(cqe->wqe_counter) & (wq->wqe_cnt - 1);
cq->verbs_cq.cq_ex.wr_id = wq->wrid[wqe_ctr];
++wq->tail;
if (cqe->op_own & MLX5_INLINE_SCATTER_32)
@@ -283,7 +283,7 @@ static inline int handle_responder(struct ibv_wc *wc, struct mlx5_cqe64 *cqe,
wq = &(rsc_to_mrwq(cur_rsc)->rq);
}

-wqe_ctr = wq->tail & (wq->wqe_cnt - 1);
+wqe_ctr = be16toh(cqe->wqe_counter) & (wq->wqe_cnt - 1);
wc->wr_id = wq->wrid[wqe_ctr];
++wq->tail;
if (cqe->op_own & MLX5_INLINE_SCATTER_32)
@@ -746,6 +746,7 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
}

opcode = mlx5dv_get_cqe_opcode(cqe64);
+wqe_ctr = be16toh(cqe64->wqe_counter);
switch (opcode) {
case MLX5_CQE_REQ:
{
@@ -755,7 +756,6 @@
if (unlikely(!mqp))
return CQ_POLL_ERR;
wq = &mqp->sq;
-wqe_ctr = be16toh(cqe64->wqe_counter);
idx = wqe_ctr & (wq->wqe_cnt - 1);
if (lazy) {
uint32_t wc_byte_len;
@@ -909,7 +909,6 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
if (unlikely(!mqp))
return CQ_POLL_ERR;
wq = &mqp->sq;
-wqe_ctr = be16toh(cqe64->wqe_counter);
idx = wqe_ctr & (wq->wqe_cnt - 1);
if (lazy)
cq->verbs_cq.cq_ex.wr_id = wq->wrid[idx];
@@ -923,7 +922,6 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
return CQ_POLL_ERR;

if (is_srq) {
-wqe_ctr = be16toh(cqe64->wqe_counter);
if (is_odp_pfault_err(ecqe)) {
mlx5_complete_odp_fault(*cur_srq, wqe_ctr);
err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
@@ -950,10 +948,11 @@ static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
break;
}

+idx = wqe_ctr & (wq->wqe_cnt - 1);
if (lazy)
-cq->verbs_cq.cq_ex.wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+cq->verbs_cq.cq_ex.wr_id = wq->wrid[idx];
else
-wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+wc->wr_id = wq->wrid[idx];
++wq->tail;
}
}
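(Note on the cq.c hunks above: once Receive WRs may be consumed out of
order, the software tail no longer identifies which WQE a given CQE
completed, so the wr_id lookup is driven by the wqe_counter the device
reports in the CQE, while the tail keeps counting delivered
completions.)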
64 changes: 63 additions & 1 deletion providers/mlx5/man/mlx5dv_create_qp.3.md
@@ -89,6 +89,68 @@ struct mlx5dv_qp_init_attr {
about the signature pipelining in
**mlx5dv_qp_cancel_posted_send_wrs**(3).

MLX5DV_QP_CREATE_OOO_DP:
If the flag is set, Receive WRs on the receiver side of the QP may be
consumed out-of-order, and the sender side of the QP may transmit
messages without guaranteeing any arrival ordering on the receiver
side.

When used, the flag must be set on both the sender and receiver sides
of a QP (e.g., on both the DCI and the DCT).

Setting the flag is optional, and the application should query whether
the feature is available (see details in **mlx5dv_query_device**(3));
there is no automatic fallback: if the flag is set while the kernel or
the device does not support the feature, creating the QP fails. Thus,
before creating a QP with this flag set, the application must query the
maximal number of outstanding Receive WRs possible on a QP with this
flag set, according to the QP type (see details in
**mlx5dv_query_device**(3)), and make sure the capability is supported.
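A hedged sketch of this query-then-create flow for an RC QP (assuming
an opened ibv_context named ctx and a filled struct ibv_qp_init_attr_ex
named attr_ex; error handling trimmed, and the flow is illustrative
rather than normative):

    struct mlx5dv_context dv = {};
    struct mlx5dv_qp_init_attr dv_attr = {};
    struct ibv_qp *qp;

    /* 1. Check that OOO receive WRs are supported for this QP type. */
    dv.comp_mask = MLX5DV_CONTEXT_MASK_OOO_RECV_WRS;
    if (mlx5dv_query_device(ctx, &dv) ||
        !(dv.comp_mask & MLX5DV_CONTEXT_MASK_OOO_RECV_WRS) ||
        !dv.ooo_recv_wrs_caps.max_rc)
            return; /* feature unavailable; there is no fallback */

    /* 2. Keep the Receive queue depth within the reported maximum. */
    if (attr_ex.cap.max_recv_wr > dv.ooo_recv_wrs_caps.max_rc)
            attr_ex.cap.max_recv_wr = dv.ooo_recv_wrs_caps.max_rc;

    /* 3. Create the QP with out-of-order data placement enabled. */
    dv_attr.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS;
    dv_attr.create_flags = MLX5DV_QP_CREATE_OOO_DP;
    qp = mlx5dv_create_qp(ctx, &attr_ex, &dv_attr);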

> **Note**
>
> All the following describe the behavior and semantics of a QP
> with this flag set.

Completions' delivery ordering:

A Receive WR posted on this QP may be consumed by any message arriving
on this QP that requires Receive WR consumption. Nonetheless, the
ordering in which work completions are delivered for the posted WRs,
on both the sender and receiver sides, remains unchanged when this
flag is set (and is independent of the order in which the Receive WRs
are consumed). The ID delivered in every work completion (wr_id)
specifies which WR was completed by that work completion.
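As an illustration of these semantics, a minimal receiver-side polling
sketch (assuming a CQ named cq; app_buf_lookup and app_consume are
hypothetical application helpers that map a wr_id back to the buffers
posted with that Receive WR):

    struct ibv_wc wc;

    while (ibv_poll_cq(cq, 1, &wc) > 0) {
            if (wc.status != IBV_WC_SUCCESS)
                    break; /* real code would report the failure */
            /* Completions are still delivered in the usual order, but
             * the consumed Receive WR may be any posted WR; locate the
             * data through wr_id rather than assuming FIFO reuse. */
            app_consume(app_buf_lookup(wc.wr_id), wc.byte_len);
    }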

Data placement and operations' execution ordering:

RDMA Read and RDMA Atomic operations are executed on the responder
side in order, i.e., each such operation is executed only after all
previous messages are done executing.
However, the order in which RDMA Read response packets are scattered
to memory on the requestor side is not guaranteed. This means that,
although the data is read after all previous messages have finished
executing, it may be scattered out of order on the requestor side.

The ordering of write requests towards memory on the responder side,
initiated by RDMA Send, RDMA Send with Immediate, RDMA Write, or RDMA
Write with Immediate, is not guaranteed.

Good and bad practice:

Since there is no guarantee which RDMA Send (and/or RDMA Send with
Immediate) will consume a given Receive WR (and scatter its data to
the memory buffers specified in that WR), posting Receive WRs of
different sizes is not recommended.

Polling on any memory that the device uses to scatter data is not
recommended, since the ordering of data placement for RDMA Send, RDMA
Write, and RDMA Write with Immediate is not guaranteed.

Upon getting a completion for an RDMA Write with Immediate, the
receiver should not rely on wr_id alone to determine to which memory
the data was scattered by the operation.
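One hedged way to follow the last recommendation is for the sender to
carry an application-level buffer tag in the immediate data; the tag
encoding and the buf_from_tag helper below are hypothetical and not
part of this API:

    struct ibv_wc wc;

    if (ibv_poll_cq(cq, 1, &wc) > 0 && wc.status == IBV_WC_SUCCESS &&
        wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
            /* wr_id names the consumed Receive WR, but it does not say
             * where this Write's payload landed; recover that from the
             * application-defined tag carried in imm_data. */
            struct app_buf *buf = buf_from_tag(be32toh(wc.imm_data));
            app_consume(buf, wc.byte_len);
    }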

*dc_init_attr*
: DC init attributes.

@@ -163,7 +225,7 @@ returns a pointer to the created QP, on error NULL will be returned and errno wi

# SEE ALSO

-**ibv_query_device_ex**(3), **ibv_create_qp_ex**(3),
+**ibv_query_device_ex**(3), **ibv_create_qp_ex**(3), **mlx5dv_query_device**(3)

# AUTHOR

14 changes: 14 additions & 0 deletions providers/mlx5/man/mlx5dv_query_device.3
@@ -65,6 +65,7 @@ struct mlx5dv_crypto_caps crypto_caps;
uint64_t max_dc_rd_atom; /* Maximum number of outstanding RDMA read/atomic per DC QP as a requester */
uint64_t max_dc_init_rd_atom; /* Maximum number of outstanding RDMA read/atomic per DC QP as a responder */
struct mlx5dv_reg reg_c0; /* value and mask to match local vport egress traffic in FDB */
struct mlx5dv_ooo_recv_wrs_caps ooo_recv_wrs_caps; /* Maximum number of outstanding WRs per out-of-order QP type */
.in -8
};

@@ -107,6 +108,7 @@ MLX5DV_CONTEXT_MASK_WR_MEMCPY_LENGTH = 1 << 12,
MLX5DV_CONTEXT_MASK_CRYPTO_OFFLOAD = 1 << 13,
MLX5DV_CONTEXT_MASK_MAX_DC_RD_ATOM = 1 << 14,
MLX5DV_CONTEXT_MASK_REG_C0 = 1 << 15,
MLX5DV_CONTEXT_MASK_OOO_RECV_WRS = 1 << 16,
.in -8
};

@@ -250,6 +252,18 @@ enum mlx5dv_crypto_caps_flags {
.in -8
};

.PP
.nf
struct mlx5dv_ooo_recv_wrs_caps {
.in +8
uint32_t max_rc;
uint32_t max_xrc;
uint32_t max_dct;
uint32_t max_ud;
uint32_t max_uc;
.in -8
};

.fi
.SH "RETURN VALUE"
0 on success or the value of errno on failure (which indicates the failure reason).
7 changes: 7 additions & 0 deletions providers/mlx5/mlx5.c
@@ -979,6 +979,13 @@ static int _mlx5dv_query_device(struct ibv_context *ctx_in,
}
}

if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_OOO_RECV_WRS) {
if (mctx->vendor_cap_flags & MLX5_VENDOR_CAP_FLAGS_OOO_DP) {
attrs_out->ooo_recv_wrs_caps = mctx->ooo_recv_wrs_caps;
comp_mask_out |= MLX5DV_CONTEXT_MASK_OOO_RECV_WRS;
}
}

attrs_out->comp_mask = comp_mask_out;

return 0;
3 changes: 3 additions & 0 deletions providers/mlx5/mlx5.h
@@ -221,6 +221,7 @@ enum mlx5_vendor_cap_flags {
MLX5_VENDOR_CAP_FLAGS_CQE_128B_PAD = 1 << 4,
MLX5_VENDOR_CAP_FLAGS_PACKET_BASED_CREDIT_MODE = 1 << 5,
MLX5_VENDOR_CAP_FLAGS_SCAT2CQE_DCT = 1 << 6,
MLX5_VENDOR_CAP_FLAGS_OOO_DP = 1 << 7,
};

enum {
@@ -423,6 +424,7 @@ struct mlx5_context {
uint64_t max_dc_rd_atom;
uint64_t max_dc_init_rd_atom;
struct mlx5dv_reg reg_c0;
struct mlx5dv_ooo_recv_wrs_caps ooo_recv_wrs_caps;
};

struct mlx5_hugetlb_mem {
@@ -652,6 +654,7 @@ struct mlx5_mr {
enum mlx5_qp_flags {
MLX5_QP_FLAGS_USE_UNDERLAY = 0x01,
MLX5_QP_FLAGS_DRAIN_SIGERR = 0x02,
MLX5_QP_FLAGS_OOO_DP = 1 << 2,
};

struct mlx5_qp {
7 changes: 6 additions & 1 deletion providers/mlx5/mlx5_ifc.h
@@ -1114,7 +1114,12 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 log_max_transport_domain[0x5];
u8 reserved_at_328[0x3];
u8 log_max_pd[0x5];
-u8 reserved_at_330[0xb];
+u8 dp_ordering_ooo_all_ud[0x1];
+u8 dp_ordering_ooo_all_uc[0x1];
+u8 dp_ordering_ooo_all_xrc[0x1];
+u8 dp_ordering_ooo_all_dc[0x1];
+u8 dp_ordering_ooo_all_rc[0x1];
+u8 reserved_at_335[0x6];
u8 log_max_xrcd[0x5];

u8 nic_receive_steering_discard[0x1];
11 changes: 11 additions & 0 deletions providers/mlx5/mlx5dv.h
@@ -88,6 +88,7 @@ enum mlx5dv_context_comp_mask {
MLX5DV_CONTEXT_MASK_CRYPTO_OFFLOAD = 1 << 13,
MLX5DV_CONTEXT_MASK_MAX_DC_RD_ATOM = 1 << 14,
MLX5DV_CONTEXT_MASK_REG_C0 = 1 << 15,
MLX5DV_CONTEXT_MASK_OOO_RECV_WRS = 1 << 16,
};

struct mlx5dv_cqe_comp_caps {
@@ -213,6 +214,14 @@ struct mlx5dv_crypto_caps {
uint32_t flags; /* use enum mlx5dv_crypto_caps_flags */
};

struct mlx5dv_ooo_recv_wrs_caps {
uint32_t max_rc;
uint32_t max_xrc;
uint32_t max_dct;
uint32_t max_ud;
uint32_t max_uc;
};

/*
* Direct verbs device-specific attributes
*/
@@ -237,6 +246,7 @@ struct mlx5dv_context {
uint64_t max_dc_rd_atom;
uint64_t max_dc_init_rd_atom;
struct mlx5dv_reg reg_c0;
struct mlx5dv_ooo_recv_wrs_caps ooo_recv_wrs_caps;
};

enum mlx5dv_context_flags {
@@ -283,6 +293,7 @@ enum mlx5dv_qp_create_flags {
MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE = 1 << 4,
MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE = 1 << 5,
MLX5DV_QP_CREATE_SIG_PIPELINING = 1 << 6,
MLX5DV_QP_CREATE_OOO_DP = 1 << 7,
};

enum mlx5dv_mkey_init_attr_flags {