
Commit 200f3ab
io_uring/eventfd: move eventfd handling to separate file
This is pretty nicely abstracted already, but let's move it to a separate file rather than have it in the main io_uring file. With that, we can also move the io_ev_fd struct and enum out of global scope.

Signed-off-by: Jens Axboe <[email protected]>
1 parent 60b6c07 commit 200f3ab
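For context, the sketch below shows the userspace side that the moved kernel code services; it is not part of the commit and assumes liburing is available. io_uring_register_eventfd() reaches io_eventfd_register() in the kernel, io_uring_unregister_eventfd() reaches io_eventfd_unregister(), and posted CQEs then signal the registered eventfd via io_eventfd_signal().

/*
 * Hedged userspace sketch, not part of this commit: register an eventfd
 * with a ring so CQE postings signal it, then tear everything down.
 * Assumes liburing; error handling is illustrative only.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>
#include <liburing.h>

int main(void)
{
        struct io_uring ring;
        int efd, ret;

        ret = io_uring_queue_init(8, &ring, 0);
        if (ret < 0) {
                fprintf(stderr, "queue_init: %d\n", ret);
                return 1;
        }

        efd = eventfd(0, 0);
        if (efd < 0) {
                perror("eventfd");
                io_uring_queue_exit(&ring);
                return 1;
        }

        /* Completions on this ring will now bump the eventfd count */
        ret = io_uring_register_eventfd(&ring, efd);
        if (ret < 0) {
                fprintf(stderr, "register_eventfd: %d\n", ret);
                close(efd);
                io_uring_queue_exit(&ring);
                return 1;
        }

        /* ... submit requests; read(efd, ...) or poll efd to wait for CQEs ... */

        io_uring_unregister_eventfd(&ring);
        close(efd);
        io_uring_queue_exit(&ring);
        return 0;
}

liburing also provides io_uring_register_eventfd_async(), which sets the eventfd_async flag checked in io_eventfd_signal() so that only completions posted from io-wq worker context signal the eventfd.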

File tree

7 files changed: +173 -153 lines changed

include/linux/io_uring_types.h (-8)

@@ -211,14 +211,6 @@ struct io_submit_state {
         struct blk_plug plug;
 };
 
-struct io_ev_fd {
-        struct eventfd_ctx *cq_ev_fd;
-        unsigned int eventfd_async: 1;
-        struct rcu_head rcu;
-        atomic_t refs;
-        atomic_t ops;
-};
-
 struct io_alloc_cache {
         void **entries;
         unsigned int nr_cached;
io_uring/Makefile (+3 -3)

@@ -4,9 +4,9 @@
 
 obj-$(CONFIG_IO_URING)         += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
                                         tctx.o filetable.o rw.o net.o poll.o \
-                                        uring_cmd.o openclose.o sqpoll.o \
-                                        xattr.o nop.o fs.o splice.o sync.o \
-                                        msg_ring.o advise.o openclose.o \
+                                        eventfd.o uring_cmd.o openclose.o \
+                                        sqpoll.o xattr.o nop.o fs.o splice.o \
+                                        sync.o msg_ring.o advise.o openclose.o \
                                         epoll.o statx.o timeout.o fdinfo.o \
                                         cancel.o waitid.o register.o \
                                         truncate.o memmap.o

io_uring/eventfd.c (new file, +160)

// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/eventfd.h>
#include <linux/eventpoll.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io-wq.h"
#include "eventfd.h"

struct io_ev_fd {
        struct eventfd_ctx *cq_ev_fd;
        unsigned int eventfd_async: 1;
        struct rcu_head rcu;
        atomic_t refs;
        atomic_t ops;
};

enum {
        IO_EVENTFD_OP_SIGNAL_BIT,
};

static void io_eventfd_free(struct rcu_head *rcu)
{
        struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);

        eventfd_ctx_put(ev_fd->cq_ev_fd);
        kfree(ev_fd);
}

static void io_eventfd_do_signal(struct rcu_head *rcu)
{
        struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);

        eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);

        if (atomic_dec_and_test(&ev_fd->refs))
                io_eventfd_free(rcu);
}

void io_eventfd_signal(struct io_ring_ctx *ctx)
{
        struct io_ev_fd *ev_fd = NULL;

        if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
                return;

        guard(rcu)();

        /*
         * rcu_dereference ctx->io_ev_fd once and use it for both for checking
         * and eventfd_signal
         */
        ev_fd = rcu_dereference(ctx->io_ev_fd);

        /*
         * Check again if ev_fd exists incase an io_eventfd_unregister call
         * completed between the NULL check of ctx->io_ev_fd at the start of
         * the function and rcu_read_lock.
         */
        if (unlikely(!ev_fd))
                return;
        if (!atomic_inc_not_zero(&ev_fd->refs))
                return;
        if (ev_fd->eventfd_async && !io_wq_current_is_worker())
                goto out;

        if (likely(eventfd_signal_allowed())) {
                eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
        } else {
                if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
                        call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
                        return;
                }
        }
out:
        if (atomic_dec_and_test(&ev_fd->refs))
                call_rcu(&ev_fd->rcu, io_eventfd_free);
}

void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
        bool skip;

        spin_lock(&ctx->completion_lock);

        /*
         * Eventfd should only get triggered when at least one event has been
         * posted. Some applications rely on the eventfd notification count
         * only changing IFF a new CQE has been added to the CQ ring. There's
         * no depedency on 1:1 relationship between how many times this
         * function is called (and hence the eventfd count) and number of CQEs
         * posted to the CQ ring.
         */
        skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
        spin_unlock(&ctx->completion_lock);
        if (skip)
                return;

        io_eventfd_signal(ctx);
}

int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
                        unsigned int eventfd_async)
{
        struct io_ev_fd *ev_fd;
        __s32 __user *fds = arg;
        int fd;

        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
                                        lockdep_is_held(&ctx->uring_lock));
        if (ev_fd)
                return -EBUSY;

        if (copy_from_user(&fd, fds, sizeof(*fds)))
                return -EFAULT;

        ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
        if (!ev_fd)
                return -ENOMEM;

        ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
        if (IS_ERR(ev_fd->cq_ev_fd)) {
                int ret = PTR_ERR(ev_fd->cq_ev_fd);
                kfree(ev_fd);
                return ret;
        }

        spin_lock(&ctx->completion_lock);
        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
        spin_unlock(&ctx->completion_lock);

        ev_fd->eventfd_async = eventfd_async;
        ctx->has_evfd = true;
        atomic_set(&ev_fd->refs, 1);
        atomic_set(&ev_fd->ops, 0);
        rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
        return 0;
}

int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
        struct io_ev_fd *ev_fd;

        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
                                        lockdep_is_held(&ctx->uring_lock));
        if (ev_fd) {
                ctx->has_evfd = false;
                rcu_assign_pointer(ctx->io_ev_fd, NULL);
                if (atomic_dec_and_test(&ev_fd->refs))
                        call_rcu(&ev_fd->rcu, io_eventfd_free);
                return 0;
        }

        return -ENXIO;
}

io_uring/eventfd.h (new file, +8)

struct io_ring_ctx;
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
                        unsigned int eventfd_async);
int io_eventfd_unregister(struct io_ring_ctx *ctx);

void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
void io_eventfd_signal(struct io_ring_ctx *ctx);

io_uring/io_uring.c (+1 -81)

@@ -101,6 +101,7 @@
 #include "poll.h"
 #include "rw.h"
 #include "alloc_cache.h"
+#include "eventfd.h"
 
 #define IORING_MAX_ENTRIES      32768
 #define IORING_MAX_CQ_ENTRIES   (2 * IORING_MAX_ENTRIES)
@@ -541,87 +542,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
         }
 }
 
-void io_eventfd_free(struct rcu_head *rcu)
-{
-        struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
-
-        eventfd_ctx_put(ev_fd->cq_ev_fd);
-        kfree(ev_fd);
-}
-
-void io_eventfd_do_signal(struct rcu_head *rcu)
-{
-        struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
-
-        eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
-
-        if (atomic_dec_and_test(&ev_fd->refs))
-                io_eventfd_free(rcu);
-}
-
-static void io_eventfd_signal(struct io_ring_ctx *ctx)
-{
-        struct io_ev_fd *ev_fd = NULL;
-
-        if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
-                return;
-
-        guard(rcu)();
-
-        /*
-         * rcu_dereference ctx->io_ev_fd once and use it for both for checking
-         * and eventfd_signal
-         */
-        ev_fd = rcu_dereference(ctx->io_ev_fd);
-
-        /*
-         * Check again if ev_fd exists incase an io_eventfd_unregister call
-         * completed between the NULL check of ctx->io_ev_fd at the start of
-         * the function and rcu_read_lock.
-         */
-        if (unlikely(!ev_fd))
-                return;
-        if (!atomic_inc_not_zero(&ev_fd->refs))
-                return;
-        if (ev_fd->eventfd_async && !io_wq_current_is_worker())
-                goto out;
-
-        if (likely(eventfd_signal_allowed())) {
-                eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
-        } else {
-                if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
-                        call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
-                        return;
-                }
-        }
-out:
-        if (atomic_dec_and_test(&ev_fd->refs))
-                call_rcu(&ev_fd->rcu, io_eventfd_free);
-}
-
-static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
-{
-        bool skip;
-
-        spin_lock(&ctx->completion_lock);
-
-        /*
-         * Eventfd should only get triggered when at least one event has been
-         * posted. Some applications rely on the eventfd notification count
-         * only changing IFF a new CQE has been added to the CQ ring. There's
-         * no depedency on 1:1 relationship between how many times this
-         * function is called (and hence the eventfd count) and number of CQEs
-         * posted to the CQ ring.
-         */
-        skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
-        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
-        spin_unlock(&ctx->completion_lock);
-        if (skip)
-                return;
-
-        io_eventfd_signal(ctx);
-}
-
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 {
         if (ctx->poll_activated)

io_uring/io_uring.h (-6)

@@ -104,12 +104,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
                         bool cancel_all);
 
-enum {
-        IO_EVENTFD_OP_SIGNAL_BIT,
-};
-
-void io_eventfd_do_signal(struct rcu_head *rcu);
-void io_eventfd_free(struct rcu_head *rcu);
 void io_activate_pollwq(struct io_ring_ctx *ctx);
 
 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)

io_uring/register.c (+1 -55)

@@ -27,65 +27,11 @@
 #include "cancel.h"
 #include "kbuf.h"
 #include "napi.h"
+#include "eventfd.h"
 
 #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
                                  IORING_REGISTER_LAST + IORING_OP_LAST)
 
-static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
-                               unsigned int eventfd_async)
-{
-        struct io_ev_fd *ev_fd;
-        __s32 __user *fds = arg;
-        int fd;
-
-        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
-                                        lockdep_is_held(&ctx->uring_lock));
-        if (ev_fd)
-                return -EBUSY;
-
-        if (copy_from_user(&fd, fds, sizeof(*fds)))
-                return -EFAULT;
-
-        ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
-        if (!ev_fd)
-                return -ENOMEM;
-
-        ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
-        if (IS_ERR(ev_fd->cq_ev_fd)) {
-                int ret = PTR_ERR(ev_fd->cq_ev_fd);
-                kfree(ev_fd);
-                return ret;
-        }
-
-        spin_lock(&ctx->completion_lock);
-        ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
-        spin_unlock(&ctx->completion_lock);
-
-        ev_fd->eventfd_async = eventfd_async;
-        ctx->has_evfd = true;
-        atomic_set(&ev_fd->refs, 1);
-        atomic_set(&ev_fd->ops, 0);
-        rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
-        return 0;
-}
-
-int io_eventfd_unregister(struct io_ring_ctx *ctx)
-{
-        struct io_ev_fd *ev_fd;
-
-        ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
-                                        lockdep_is_held(&ctx->uring_lock));
-        if (ev_fd) {
-                ctx->has_evfd = false;
-                rcu_assign_pointer(ctx->io_ev_fd, NULL);
-                if (atomic_dec_and_test(&ev_fd->refs))
-                        call_rcu(&ev_fd->rcu, io_eventfd_free);
-                return 0;
-        }
-
-        return -ENXIO;
-}
-
 static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
                            unsigned nr_args)
 {
