Skip to content

Commit

Permalink
support for shadow ioeventfd (#698)
Browse files Browse the repository at this point in the history
When an ioeventfd is written to, KVM discards the value since it has no
memory to write it to, and simply kicks the eventfd. This is a problem for
devices such as NVMe controllers that need the value (e.g. doorbells on
BAR0). This patch allows the vfio-user server to pass a file descriptor
that can be mmap'ed and KVM can write the ioeventfd value to this
_shadow_ memory instead of discarding it. This shadow memory is not
exposed to the guest.

Signed-off-by: Thanos Makatos <[email protected]>
Reviewed-by: John Levon <[email protected]>
Change-Id: Iad849c94076ffa5988e034c8bf7ec312d01f095f
  • Loading branch information
tmakatos authored Jul 4, 2022
1 parent 1c27402 commit 36beb63
Show file tree
Hide file tree
Showing 12 changed files with 184 additions and 13 deletions.
25 changes: 25 additions & 0 deletions docs/ioregionfd.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# ioregionfd

ioregionfd is a mechanism that speeds up ioeventfds:
https://lore.kernel.org/kvm/[email protected]/. In the
author's original words: "ioregionfd is a KVM dispatch mechanism which can be
used for handling MMIO/PIO accesses over file descriptors without returning
from ioctl(KVM_RUN).".

libvfio-user currently supports an experimental variant of this mechanism
called shadow ioeventfd. A shadow ioeventfd is a normal ioeventfd where the
vfio-user server passes another piece of memory (called the _shadow_ memory)
via an additional file descriptor when configuring the ioregionfd; QEMU then
memory maps this file descriptor and passes the resulting address to KVM. This
shadow memory is never exposed to the guest. When the guest writes to the
trapped memory, KVM writes the value to the shadow memory instead of discarding
it, and then proceeds to kick the eventfd as normal.

To use shadow ioeventfd, the kernel and QEMU need to be patched. The QEMU patch
is designed specifically for SPDK's doorbells (one ioregionfd of 4K in BAR0);
it should be trivial to extend.

The list of patches:
* kernel: https://gist.github.com/tmakatos/532afd092a8df2175120d3dbfcd719ef
* QEMU: https://gist.github.com/tmakatos/57755d2a37a6d53c9ff392e7c34470f6
* SPDK: https://gist.github.com/tmakatos/f6c10fdaff59c9d629f94bd8e44a53bc
10 changes: 8 additions & 2 deletions include/libvfio-user.h
Original file line number Diff line number Diff line change
Expand Up @@ -1069,12 +1069,18 @@ vfu_sg_is_mappable(vfu_ctx_t *vfu_ctx, dma_sg_t *sg);
* @size: size of the ioeventfd
* @flags: Any flags to set up the ioeventfd
* @datamatch: sets the datamatch value
* @shadow_fd: File descriptor that can be mmap'ed, KVM will write there the
* otherwise discarded value when the ioeventfd is written to. If set to -1
* then a normal ioeventfd is set up instead of a shadow one. The vfio-user
* client is free to ignore this, even if it supports shadow ioeventfds.
* Requires a kernel with shadow ioeventfd support.
* Experimental, must be compiled with SHADOW_IOEVENTFD defined, otherwise
* must be -1.
*/
int
vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd,
size_t offset, uint32_t size, uint32_t flags,
uint64_t datamatch);

uint64_t datamatch, int shadow_fd);
#ifdef __cplusplus
}
#endif
Expand Down
1 change: 1 addition & 0 deletions include/vfio-user.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@ typedef struct vfio_user_region_io_fds_request {

#define VFIO_USER_IO_FD_TYPE_IOEVENTFD 0
#define VFIO_USER_IO_FD_TYPE_IOREGIONFD 1
#define VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW 2

typedef struct vfio_user_sub_region_ioeventfd {
uint64_t offset;
Expand Down
24 changes: 21 additions & 3 deletions lib/libvfio-user.c
Original file line number Diff line number Diff line change
Expand Up @@ -467,13 +467,19 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
EXPORT int
vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd,
size_t offset, uint32_t size, uint32_t flags,
uint64_t datamatch)
uint64_t datamatch, int shadow_fd)
{
vfu_reg_info_t *vfu_reg;

assert(vfu_ctx != NULL);
assert(fd >= 0);

#ifndef SHADOW_IOEVENTFD
if (shadow_fd != -1) {
return ERROR_INT(EINVAL);
}
#endif

if (region_idx >= VFU_PCI_DEV_NUM_REGIONS) {
return ERROR_INT(EINVAL);
}
Expand All @@ -494,6 +500,7 @@ vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd,
elem->size = size;
elem->flags = flags;
elem->datamatch = datamatch;
elem->shadow_fd = shadow_fd;
LIST_INSERT_HEAD(&vfu_reg->subregions, elem, entry);

return 0;
Expand Down Expand Up @@ -555,6 +562,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
ioeventfd_t *sub_reg = NULL;
size_t nr_sub_reg = 0;
size_t i = 0;
size_t nr_shadow_reg = 0;

assert(vfu_ctx != NULL);
assert(msg != NULL);
Expand Down Expand Up @@ -585,6 +593,9 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)

LIST_FOREACH(sub_reg, &vfu_reg->subregions, entry) {
nr_sub_reg++;
if (sub_reg->shadow_fd != -1) {
nr_shadow_reg++;
}
}

if (req->argsz < sizeof(vfio_user_region_io_fds_reply_t) ||
Expand Down Expand Up @@ -614,7 +625,8 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)

msg->out.nr_fds = 0;
if (req->argsz >= reply->argsz) {
msg->out.fds = calloc(sizeof(int), max_sent_sub_regions);
msg->out.fds = calloc(sizeof(int),
max_sent_sub_regions + nr_shadow_reg);
if (msg->out.fds == NULL) {
return -1;
}
Expand All @@ -627,7 +639,13 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg)
ioefd->size = sub_reg->size;
ioefd->fd_index = add_fd_index(msg->out.fds, &msg->out.nr_fds,
sub_reg->fd);
ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD;
if (sub_reg->shadow_fd == -1) {
ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD;
} else {
ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW;
int ret = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->shadow_fd);
assert(ret == 1);
}
ioefd->flags = sub_reg->flags;
ioefd->datamatch = sub_reg->datamatch;

Expand Down
1 change: 1 addition & 0 deletions lib/private.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ typedef struct ioeventfd {
int32_t fd;
uint32_t flags;
uint64_t datamatch;
int32_t shadow_fd;
LIST_ENTRY(ioeventfd) entry;
} ioeventfd_t;

Expand Down
5 changes: 5 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ opt_tran_pipe = get_option('tran-pipe')
opt_debug_logs = get_option('debug-logs')
opt_sanitizers = get_option('b_sanitize')
opt_debug = get_option('debug')
opt_shadow_ioeventfd = get_option('shadow-ioeventfd')

cc = meson.get_compiler('c')

Expand Down Expand Up @@ -57,6 +58,10 @@ if opt_debug_logs.enabled() or (not opt_debug_logs.disabled() and opt_debug)
common_cflags += ['-DDEBUG']
endif

if opt_shadow_ioeventfd
common_cflags += ['-DSHADOW_IOEVENTFD']
endif

if get_option('warning_level') == '2'
# -Wall is set for 'warning_level>=1'
# -Wextra is set for 'warning_level>=2'
Expand Down
2 changes: 2 additions & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ option('tran-pipe', type: 'boolean', value: false,
description: 'enable pipe transport for testing')
option('debug-logs', type: 'feature', value: 'auto',
description: 'enable extra debugging code (default for debug builds)')
option('shadow-ioeventfd', type: 'boolean', value : false,
description: 'enable shadow ioeventfd (experimental)')
15 changes: 12 additions & 3 deletions test/py/libvfio_user.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@

VFIO_USER_IO_FD_TYPE_IOEVENTFD = 0
VFIO_USER_IO_FD_TYPE_IOREGIONFD = 1
VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW = 2


# enum vfu_dev_irq_type
Expand Down Expand Up @@ -620,7 +621,7 @@ class vfio_user_migration_info(Structure):

lib.vfu_create_ioeventfd.argtypes = (c.c_void_p, c.c_uint32, c.c_int,
c.c_size_t, c.c_uint32, c.c_uint32,
c.c_uint64)
c.c_uint64, c.c_int32)

lib.vfu_device_quiesced.argtypes = (c.c_void_p, c.c_int)

Expand All @@ -635,6 +636,10 @@ def to_byte(val):
return val.to_bytes(1, 'little')


def to_bytes_le(n, length=1):
    """Encode the integer n as exactly `length` bytes in little-endian order.

    Raises OverflowError (from int.to_bytes) if n does not fit in `length`
    bytes.
    """
    encoded = n.to_bytes(length, byteorder='little')
    return encoded


def skip(fmt, buf):
    """Strip from the front of buf as many bytes as the struct format `fmt`
    occupies, and return whatever follows."""
    consumed = struct.calcsize(fmt)
    remainder = buf[consumed:]
    return remainder
Expand All @@ -645,6 +650,9 @@ def parse_json(json_str):
return json.loads(json_str, object_hook=lambda d: SimpleNamespace(**d))


IOEVENT_SIZE = 8


def eventfd(initval=0, flags=0):
    """Call libc's eventfd() with the given initial counter value and flags
    and return whatever it returns.

    The argument types are (re)declared on every call so that the values are
    marshalled as (unsigned int, int).
    """
    # presumably `libc` is a ctypes library handle and `c` is the ctypes
    # module, both set up elsewhere in this file -- confirm
    libc.eventfd.argtypes = (c.c_uint, c.c_int)
    result = libc.eventfd(initval, flags)
    return result
Expand Down Expand Up @@ -1184,11 +1192,12 @@ def vfu_sgl_put(ctx, sg, iovec, cnt=1):
return lib.vfu_sgl_put(ctx, sg, iovec, cnt)


def vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, flags, datamatch):
def vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, flags, datamatch,
shadow_fd=-1):
assert ctx is not None

return lib.vfu_create_ioeventfd(ctx, region_idx, fd, offset, size,
flags, datamatch)
flags, datamatch, shadow_fd)


def vfu_device_quiesced(ctx, err):
Expand Down
4 changes: 4 additions & 0 deletions test/py/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ python_tests = [
'test_vfu_realize_ctx.py',
]

if get_option('shadow-ioeventfd')
python_tests += 'test_shadow_ioeventfd.py'
endif

python_files = python_tests_common + python_tests

if pytest.found() and opt_sanitizers == 'none'
Expand Down
1 change: 0 additions & 1 deletion test/py/test_device_get_region_io_fds.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
ctx = None
sock = None
fds = []
IOEVENT_SIZE = 8


def test_device_get_region_io_fds_setup():
Expand Down
4 changes: 0 additions & 4 deletions test/py/test_pci_caps.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,10 +349,6 @@ def test_pci_cap_write_px(mock_quiesce, mock_reset):
expect=errno.EINVAL)


def to_bytes_le(n, length=1):
    """Encode the integer n as exactly `length` bytes in little-endian order.

    Raises OverflowError (from int.to_bytes) if n does not fit in `length`
    bytes.
    """
    encoded = n.to_bytes(length, byteorder='little')
    return encoded


def test_pci_cap_write_msix():
setup_pci_dev(realize=True)
sock = connect_client(ctx)
Expand Down
105 changes: 105 additions & 0 deletions test/py/test_shadow_ioeventfd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#
# Copyright (c) 2022 Nutanix Inc. All rights reserved.
#
# Authors: Thanos Makatos <[email protected]>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of Nutanix nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.
#

from libvfio_user import *
import tempfile
import mmap
import errno


def test_shadow_ioeventfd():
    """Configure a shadow ioeventfd, have the client trigger it, confirm that
    the server receives the notification and can see the value."""

    # server setup
    ctx = vfu_create_ctx(flags=LIBVFIO_USER_FLAG_ATTACH_NB)
    assert ctx is not None
    ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_BAR0_REGION_IDX, size=0x1000,
                           flags=VFU_REGION_FLAG_RW)
    assert ret == 0

    # Backing file for the shadow memory; it must be mmap'able by the client.
    fo = tempfile.TemporaryFile(dir="/dev/shm")
    fo.truncate(0x1000)

    # On Linux EFD_NONBLOCK has the same value as O_NONBLOCK, so take it from
    # the os module rather than hard-coding the x86 value (O_NONBLOCK differs
    # on some architectures).
    EFD_NONBLOCK = os.O_NONBLOCK

    efd = eventfd(flags=EFD_NONBLOCK)
    ret = vfu_create_ioeventfd(ctx, VFU_PCI_DEV_BAR0_REGION_IDX, efd, 0x8,
                               0x16, 0, 0, shadow_fd=fo.fileno())
    assert ret == 0
    ret = vfu_realize_ctx(ctx)
    assert ret == 0

    # client queries I/O region FDs
    sock = connect_client(ctx)
    payload = vfio_user_region_io_fds_request(
                argsz=len(vfio_user_region_io_fds_reply()) +
                len(vfio_user_sub_region_ioeventfd()), flags=0,
                index=VFU_PCI_DEV_BAR0_REGION_IDX, count=0)
    newfds, ret = msg_fds(ctx, sock, VFIO_USER_DEVICE_GET_REGION_IO_FDS,
                          payload, expect=0)
    reply, ret = vfio_user_region_io_fds_reply.pop_from_buffer(ret)
    assert reply.count == 1  # 1 eventfd
    ioevent, _ = vfio_user_sub_region_ioeventfd.pop_from_buffer(ret)
    assert ioevent.offset == 0x8
    assert ioevent.size == 0x16
    assert ioevent.fd_index == 0
    assert ioevent.type == VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW
    assert ioevent.flags == 0
    assert ioevent.datamatch == 0

    assert len(newfds) == 2  # 2 FDs: eventfd plus shadow FD
    cefd = newfds[0]
    csfd = newfds[1]
    cmem = mmap.mmap(csfd, 0x1000)

    # vfio-user app reads the eventfd, there should be nothing there: the
    # non-blocking read must fail with EAGAIN.
    try:
        os.read(efd, IOEVENT_SIZE)
    except BlockingIOError as e:
        assert e.errno == errno.EAGAIN, \
            "unexpected errno %d reading empty eventfd" % e.errno
    else:
        assert False, "eventfd was readable before the client wrote to it"

    # Client writes to the I/O region. The write to the eventfd would be done
    # by KVM and the value would be the same in both cases.
    cmem.seek(0x8)
    cmem.write(c.c_ulonglong(0xdeadbeef))
    os.write(cefd, c.c_ulonglong(0xcafebabe))

    # vfio-user app reads eventfd, then checks the value that landed in the
    # shadow memory at the ioeventfd's offset.
    assert os.read(efd, IOEVENT_SIZE) == to_bytes_le(0xcafebabe, 8)
    fo.seek(0x8)
    assert fo.read(0x8) == to_bytes_le(0xdeadbeef, 8)

    vfu_destroy_ctx(ctx)

0 comments on commit 36beb63

Please sign in to comment.