Skip to content

Commit 6f37787

Browse files
spikeh authored and axboe committed
io_uring/zcrx: add interface queue and refill queue
Add a new object called an interface queue (ifq) that represents a net rx queue that has been configured for zero copy. Each ifq is registered using a new registration opcode IORING_REGISTER_ZCRX_IFQ.

The refill queue is allocated by the kernel and mapped by userspace using a new offset IORING_OFF_RQ_RING, in a similar fashion to the main SQ/CQ. It is used by userspace to return buffers that it is done with, which will then be re-used by the netdev again.

The main CQ ring is used to notify userspace of received data by using the upper 16 bytes of a big CQE as a new struct io_uring_zcrx_cqe. Each entry contains the offset + len to the data.

For now, each io_uring instance only has a single ifq.

Reviewed-by: Jens Axboe <[email protected]>
Signed-off-by: David Wei <[email protected]>
Acked-by: Jakub Kicinski <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Jens Axboe <[email protected]>
1 parent 5c496ff commit 6f37787

File tree

10 files changed

+260
-1
lines changed

10 files changed

+260
-1
lines changed

Kconfig

+2
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,5 @@ source "lib/Kconfig"
3030
source "lib/Kconfig.debug"
3131

3232
source "Documentation/Kconfig"
33+
34+
source "io_uring/KConfig"

include/linux/io_uring_types.h

+6
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ enum io_uring_cmd_flags {
4040
IO_URING_F_TASK_DEAD = (1 << 13),
4141
};
4242

43+
struct io_zcrx_ifq;
44+
4345
struct io_wq_work_node {
4446
struct io_wq_work_node *next;
4547
};
@@ -382,6 +384,8 @@ struct io_ring_ctx {
382384
struct wait_queue_head poll_wq;
383385
struct io_restriction restrictions;
384386

387+
struct io_zcrx_ifq *ifq;
388+
385389
u32 pers_next;
386390
struct xarray personalities;
387391

@@ -434,6 +438,8 @@ struct io_ring_ctx {
434438
struct io_mapped_region ring_region;
435439
/* used for optimised request parameter and wait argument passing */
436440
struct io_mapped_region param_region;
441+
/* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
442+
struct io_mapped_region zcrx_region;
437443
};
438444

439445
/*

include/uapi/linux/io_uring.h

+42-1
Original file line numberDiff line numberDiff line change
@@ -639,7 +639,8 @@ enum io_uring_register_op {
639639
/* send MSG_RING without having a ring */
640640
IORING_REGISTER_SEND_MSG_RING = 31,
641641

642-
/* 32 reserved for zc rx */
642+
/* register a netdev hw rx queue for zerocopy */
643+
IORING_REGISTER_ZCRX_IFQ = 32,
643644

644645
/* resize CQ ring */
645646
IORING_REGISTER_RESIZE_RINGS = 33,
@@ -956,6 +957,46 @@ enum io_uring_socket_op {
956957
SOCKET_URING_OP_SETSOCKOPT,
957958
};
958959

960+
/* Zero copy receive refill queue entry: written by userspace to hand back a buffer it is done with */
961+
struct io_uring_zcrx_rqe {
962+
__u64 off; /* buffer offset; NOTE(review): presumably carries the area id in the bits from IORING_ZCRX_AREA_SHIFT upwards -- confirm */
963+
__u32 len;
964+
__u32 __pad; /* padding; NOTE(review): presumably expected to be zero -- not checked in the code shown here */
965+
};
966+
967+
struct io_uring_zcrx_cqe { /* zero copy receive notification, carried in the upper 16 bytes of a big CQE */
968+
__u64 off; /* offset of the received data */
969+
__u64 __pad;
970+
};
971+
972+
/* The bit from which area id is encoded into offsets */
972+
#define IORING_ZCRX_AREA_SHIFT 48
973+
#define IORING_ZCRX_AREA_MASK (~(((__u64)1 << IORING_ZCRX_AREA_SHIFT) - 1)) /* selects bits 48..63 of an offset */
975+
976+
struct io_uring_zcrx_offsets { /* byte offsets describing the refill ring layout, filled in by the kernel at registration */
976+
__u32 head; /* offset of the ring head index */
977+
__u32 tail; /* offset of the ring tail index */
978+
__u32 rqes; /* offset of the first struct io_uring_zcrx_rqe (right after the ring header) */
979+
__u32 __resv2; /* reserved */
980+
__u64 __resv[2]; /* reserved */
981+
};
983+
984+
/*
984+
* Argument for IORING_REGISTER_ZCRX_IFQ
985+
*/
986+
struct io_uring_zcrx_ifq_reg {
987+
__u32 if_idx; /* NOTE(review): presumably the netdev interface index -- not consumed by the registration code shown here */
988+
__u32 if_rxq; /* rx queue to bind; a value of -1 is rejected with -EINVAL */
989+
__u32 rq_entries; /* requested refill ring entries, must be non-zero; rounded up to a power of two and written back */
990+
__u32 flags; /* must be zero */
991+
992+
__u64 area_ptr; /* pointer to struct io_uring_zcrx_area_reg; must be non-zero */
993+
__u64 region_ptr; /* struct io_uring_region_desc *; read at registration and written back on success */
994+
995+
struct io_uring_zcrx_offsets offsets; /* output: refill ring layout, set by the kernel */
996+
__u64 __resv[4]; /* must be zero */
997+
};
999+
9591000
#ifdef __cplusplus
9601001
}
9611002
#endif

io_uring/KConfig

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# SPDX-License-Identifier: GPL-2.0-only
2+
#
3+
# io_uring configuration
4+
#
5+
6+
# Zero copy receive support for io_uring. Not user-selectable: it is enabled
# automatically whenever all of the dependencies below are enabled.
config IO_URING_ZCRX
7+
def_bool y
8+
depends on PAGE_POOL
9+
depends on INET
10+
depends on NET_RX_BUSY_POLL

io_uring/Makefile

+1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
1414
epoll.o statx.o timeout.o fdinfo.o \
1515
cancel.o waitid.o register.o \
1616
truncate.o memmap.o alloc_cache.o
17+
obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
1718
obj-$(CONFIG_IO_WQ) += io-wq.o
1819
obj-$(CONFIG_FUTEX) += futex.o
1920
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o

io_uring/io_uring.c

+7
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@
9797
#include "uring_cmd.h"
9898
#include "msg_ring.h"
9999
#include "memmap.h"
100+
#include "zcrx.h"
100101

101102
#include "timeout.h"
102103
#include "poll.h"
@@ -2700,6 +2701,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
27002701
mutex_lock(&ctx->uring_lock);
27012702
io_sqe_buffers_unregister(ctx);
27022703
io_sqe_files_unregister(ctx);
2704+
io_unregister_zcrx_ifqs(ctx);
27032705
io_cqring_overflow_kill(ctx);
27042706
io_eventfd_unregister(ctx);
27052707
io_free_alloc_caches(ctx);
@@ -2859,6 +2861,11 @@ static __cold void io_ring_exit_work(struct work_struct *work)
28592861
io_cqring_overflow_kill(ctx);
28602862
mutex_unlock(&ctx->uring_lock);
28612863
}
2864+
if (ctx->ifq) {
2865+
mutex_lock(&ctx->uring_lock);
2866+
io_shutdown_zcrx_ifqs(ctx);
2867+
mutex_unlock(&ctx->uring_lock);
2868+
}
28622869

28632870
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
28642871
io_move_task_work_from_local(ctx);

io_uring/memmap.h

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define IO_URING_MEMMAP_H
33

44
#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL
5+
#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL
56

67
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
78

io_uring/register.c

+7
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "eventfd.h"
3131
#include "msg_ring.h"
3232
#include "memmap.h"
33+
#include "zcrx.h"
3334

3435
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
3536
IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -813,6 +814,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
813814
break;
814815
ret = io_register_clone_buffers(ctx, arg);
815816
break;
817+
case IORING_REGISTER_ZCRX_IFQ:
818+
ret = -EINVAL;
819+
if (!arg || nr_args != 1)
820+
break;
821+
ret = io_register_zcrx_ifq(ctx, arg);
822+
break;
816823
case IORING_REGISTER_RESIZE_RINGS:
817824
ret = -EINVAL;
818825
if (!arg || nr_args != 1)

io_uring/zcrx.c

+149
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
#include <linux/kernel.h>
3+
#include <linux/errno.h>
4+
#include <linux/mm.h>
5+
#include <linux/io_uring.h>
6+
7+
#include <uapi/linux/io_uring.h>
8+
9+
#include "io_uring.h"
10+
#include "kbuf.h"
11+
#include "memmap.h"
12+
#include "zcrx.h"
13+
14+
#define IO_RQ_MAX_ENTRIES 32768
15+
16+
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
17+
struct io_uring_zcrx_ifq_reg *reg,
18+
struct io_uring_region_desc *rd)
19+
{
20+
size_t off, size;
21+
void *ptr;
22+
int ret;
23+
24+
off = sizeof(struct io_uring);
25+
size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
26+
if (size > rd->size)
27+
return -EINVAL;
28+
29+
ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
30+
IORING_MAP_OFF_ZCRX_REGION);
31+
if (ret < 0)
32+
return ret;
33+
34+
ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
35+
ifq->rq_ring = (struct io_uring *)ptr;
36+
ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
37+
return 0;
38+
}
39+
40+
static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
41+
{
42+
io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
43+
ifq->rq_ring = NULL;
44+
ifq->rqes = NULL;
45+
}
46+
47+
static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
48+
{
49+
struct io_zcrx_ifq *ifq;
50+
51+
ifq = kzalloc(sizeof(*ifq), GFP_KERNEL);
52+
if (!ifq)
53+
return NULL;
54+
55+
ifq->if_rxq = -1;
56+
ifq->ctx = ctx;
57+
return ifq;
58+
}
59+
60+
/* Tear down @ifq: release its refill ring region first, then free the object itself. */
static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
61+
{
62+
io_free_rbuf_ring(ifq);
63+
kfree(ifq);
64+
}
65+
66+
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
67+
struct io_uring_zcrx_ifq_reg __user *arg)
68+
{
69+
struct io_uring_zcrx_ifq_reg reg;
70+
struct io_uring_region_desc rd;
71+
struct io_zcrx_ifq *ifq;
72+
int ret;
73+
74+
/*
75+
* 1. Interface queue allocation.
76+
* 2. It can observe data destined for sockets of other tasks.
77+
*/
78+
if (!capable(CAP_NET_ADMIN))
79+
return -EPERM;
80+
81+
/* mandatory io_uring features for zc rx */
82+
if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
83+
ctx->flags & IORING_SETUP_CQE32))
84+
return -EINVAL;
85+
if (ctx->ifq)
86+
return -EBUSY;
87+
if (copy_from_user(&reg, arg, sizeof(reg)))
88+
return -EFAULT;
89+
if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
90+
return -EFAULT;
91+
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
92+
return -EINVAL;
93+
if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
94+
return -EINVAL;
95+
if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
96+
if (!(ctx->flags & IORING_SETUP_CLAMP))
97+
return -EINVAL;
98+
reg.rq_entries = IO_RQ_MAX_ENTRIES;
99+
}
100+
reg.rq_entries = roundup_pow_of_two(reg.rq_entries);
101+
102+
if (!reg.area_ptr)
103+
return -EFAULT;
104+
105+
ifq = io_zcrx_ifq_alloc(ctx);
106+
if (!ifq)
107+
return -ENOMEM;
108+
109+
ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
110+
if (ret)
111+
goto err;
112+
113+
ifq->rq_entries = reg.rq_entries;
114+
ifq->if_rxq = reg.if_rxq;
115+
116+
reg.offsets.rqes = sizeof(struct io_uring);
117+
reg.offsets.head = offsetof(struct io_uring, head);
118+
reg.offsets.tail = offsetof(struct io_uring, tail);
119+
120+
if (copy_to_user(arg, &reg, sizeof(reg)) ||
121+
copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd))) {
122+
ret = -EFAULT;
123+
goto err;
124+
}
125+
126+
ctx->ifq = ifq;
127+
return 0;
128+
err:
129+
io_zcrx_ifq_free(ifq);
130+
return ret;
131+
}
132+
133+
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
134+
{
135+
struct io_zcrx_ifq *ifq = ctx->ifq;
136+
137+
lockdep_assert_held(&ctx->uring_lock);
138+
139+
if (!ifq)
140+
return;
141+
142+
ctx->ifq = NULL;
143+
io_zcrx_ifq_free(ifq);
144+
}
145+
146+
/* Shutdown hook for the ring's ifq; currently it only asserts that ctx->uring_lock is held. */
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
147+
{
148+
lockdep_assert_held(&ctx->uring_lock);
149+
}

io_uring/zcrx.h

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
#ifndef IOU_ZC_RX_H
3+
#define IOU_ZC_RX_H
4+
5+
#include <linux/io_uring_types.h>
6+
7+
struct io_zcrx_ifq { /* kernel-side state of a zero copy rx interface queue */
8+
struct io_ring_ctx *ctx; /* ring this ifq belongs to */
9+
struct io_uring *rq_ring; /* refill ring header (head/tail), at the start of the zcrx region */
10+
struct io_uring_zcrx_rqe *rqes; /* refill entries, placed right after the ring header */
11+
u32 rq_entries; /* number of refill entries, a power of two */
12+
13+
u32 if_rxq; /* rx queue index from registration; -1 while unset */
14+
};
15+
16+
#if defined(CONFIG_IO_URING_ZCRX)
17+
int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
18+
struct io_uring_zcrx_ifq_reg __user *arg);
19+
void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx);
20+
void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx);
21+
#else /* !CONFIG_IO_URING_ZCRX: stubs so callers need no #ifdefs */
22+
static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
23+
struct io_uring_zcrx_ifq_reg __user *arg)
24+
{
25+
return -EOPNOTSUPP; /* registration is refused when zcrx is not built in */
26+
}
27+
static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
28+
{
29+
}
30+
static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
31+
{
32+
}
33+
#endif /* CONFIG_IO_URING_ZCRX */
34+
35+
#endif

0 commit comments

Comments
 (0)