diff --git a/docs/ioregionfd.md b/docs/ioregionfd.md new file mode 100644 index 00000000..c09b077c --- /dev/null +++ b/docs/ioregionfd.md @@ -0,0 +1,25 @@ +# ioregionfd + +ioregionfd is a mechanism that speeds up ioeventfds: +https://lore.kernel.org/kvm/cover.1613828726.git.eafanasova@gmail.com/. In the +author's original words: "ioregionfd is a KVM dispatch mechanism which can be +used for handling MMIO/PIO accesses over file descriptors without returning +from ioctl(KVM_RUN).". + +libvfio-user currently supports an experimental variant of this mechanism +called shadow ioeventfd. A shadow ioeventfd is a normal ioeventfd where the +vfio-user server passes another piece of memory (called the _shadow_ memory) +via an additional file descriptor when configuring the ioregionfd, which then +QEMU memory maps and passes this address to KVM. This shadow memory is never +exposed to the guest. When the guest writes to the trapped memory, KVM writes +the value to the shadow memory instread of discarding it, and then proceeds +kicking the eventfd as normal. + +To use shadow ioeventfd, the kernel and QEMU need to be patched. The QEMU patch +is designed specifically for SPDK's doorbells (one ioregionfd of 4K in BAR0); +it should be trivial to extend. + +The list of patches: +* kernel: https://gist.github.com/tmakatos/532afd092a8df2175120d3dbfcd719ef +* QEMU: https://gist.github.com/tmakatos/57755d2a37a6d53c9ff392e7c34470f6 +* SPDK: https://gist.github.com/tmakatos/f6c10fdaff59c9d629f94bd8e44a53bc diff --git a/include/libvfio-user.h b/include/libvfio-user.h index 84eb2d88..d12d89f0 100644 --- a/include/libvfio-user.h +++ b/include/libvfio-user.h @@ -1069,12 +1069,18 @@ vfu_sg_is_mappable(vfu_ctx_t *vfu_ctx, dma_sg_t *sg); * @size: size of the ioeventfd * @flags: Any flags to set up the ioeventfd * @datamatch: sets the datamatch value + * @shadow_fd: File descriptor that can be mmap'ed, KVM will write there the + * otherwise discarded value when the ioeventfd is written to. If set to -1 + * then a normal ioeventfd is set up instead of a shadow one. The vfio-user + * client is free to ignore this, even if it supports shadow ioeventfds. + * Requires a kernel with shadow ioeventfd support. + * Experimental, must be compiled with SHADOW_IOEVENTFD defined, otherwise + * must be -1. */ int vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, size_t offset, uint32_t size, uint32_t flags, - uint64_t datamatch); - + uint64_t datamatch, int shadow_fd); #ifdef __cplusplus } #endif diff --git a/include/vfio-user.h b/include/vfio-user.h index 7439484b..dc6fafaa 100644 --- a/include/vfio-user.h +++ b/include/vfio-user.h @@ -168,6 +168,7 @@ typedef struct vfio_user_region_io_fds_request { #define VFIO_USER_IO_FD_TYPE_IOEVENTFD 0 #define VFIO_USER_IO_FD_TYPE_IOREGIONFD 1 +#define VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW 2 typedef struct vfio_user_sub_region_ioeventfd { uint64_t offset; diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c index ac04d3ba..5ce57679 100644 --- a/lib/libvfio-user.c +++ b/lib/libvfio-user.c @@ -467,13 +467,19 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) EXPORT int vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, size_t offset, uint32_t size, uint32_t flags, - uint64_t datamatch) + uint64_t datamatch, int shadow_fd) { vfu_reg_info_t *vfu_reg; assert(vfu_ctx != NULL); assert(fd >= 0); +#ifndef SHADOW_IOEVENTFD + if (shadow_fd != -1) { + return ERROR_INT(EINVAL); + } +#endif + if (region_idx >= VFU_PCI_DEV_NUM_REGIONS) { return ERROR_INT(EINVAL); } @@ -494,6 +500,7 @@ vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, elem->size = size; elem->flags = flags; elem->datamatch = datamatch; + elem->shadow_fd = shadow_fd; LIST_INSERT_HEAD(&vfu_reg->subregions, elem, entry); return 0; @@ -555,6 +562,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ioeventfd_t *sub_reg = NULL; size_t nr_sub_reg = 0; size_t i = 0; + size_t nr_shadow_reg = 0; assert(vfu_ctx != NULL); assert(msg != NULL); @@ -585,6 +593,9 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) LIST_FOREACH(sub_reg, &vfu_reg->subregions, entry) { nr_sub_reg++; + if (sub_reg->shadow_fd != -1) { + nr_shadow_reg++; + } } if (req->argsz < sizeof(vfio_user_region_io_fds_reply_t) || @@ -614,7 +625,8 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) msg->out.nr_fds = 0; if (req->argsz >= reply->argsz) { - msg->out.fds = calloc(sizeof(int), max_sent_sub_regions); + msg->out.fds = calloc(sizeof(int), + max_sent_sub_regions + nr_shadow_reg); if (msg->out.fds == NULL) { return -1; } @@ -627,7 +639,13 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ioefd->size = sub_reg->size; ioefd->fd_index = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->fd); - ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD; + if (sub_reg->shadow_fd == -1) { + ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD; + } else { + ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW; + int ret = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->shadow_fd); + assert(ret == 1); + } ioefd->flags = sub_reg->flags; ioefd->datamatch = sub_reg->datamatch; diff --git a/lib/private.h b/lib/private.h index 7ffd6bec..b8751385 100644 --- a/lib/private.h +++ b/lib/private.h @@ -186,6 +186,7 @@ typedef struct ioeventfd { int32_t fd; uint32_t flags; uint64_t datamatch; + int32_t shadow_fd; LIST_ENTRY(ioeventfd) entry; } ioeventfd_t; diff --git a/meson.build b/meson.build index 03e92674..dba2ba69 100644 --- a/meson.build +++ b/meson.build @@ -18,6 +18,7 @@ opt_tran_pipe = get_option('tran-pipe') opt_debug_logs = get_option('debug-logs') opt_sanitizers = get_option('b_sanitize') opt_debug = get_option('debug') +opt_shadow_ioeventfd = get_option('shadow-ioeventfd') cc = meson.get_compiler('c') @@ -57,6 +58,10 @@ if opt_debug_logs.enabled() or (not opt_debug_logs.disabled() and opt_debug) common_cflags += ['-DDEBUG'] endif +if opt_shadow_ioeventfd + common_cflags += ['-DSHADOW_IOEVENTFD'] +endif + if get_option('warning_level') == '2' # -Wall is set for 'warning_level>=1' # -Wextra is set for 'warning_level>=2' diff --git a/meson_options.txt b/meson_options.txt index 49e4d9ac..5dd2efdb 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -4,3 +4,5 @@ option('tran-pipe', type: 'boolean', value: false, description: 'enable pipe transport for testing') option('debug-logs', type: 'feature', value: 'auto', description: 'enable extra debugging code (default for debug builds)') +option('shadow-ioeventfd', type: 'boolean', value : false, + description: 'enable shadow ioeventfd (experimental)') diff --git a/test/py/libvfio_user.py b/test/py/libvfio_user.py index 0ebf7738..c8525429 100644 --- a/test/py/libvfio_user.py +++ b/test/py/libvfio_user.py @@ -194,6 +194,7 @@ VFIO_USER_IO_FD_TYPE_IOEVENTFD = 0 VFIO_USER_IO_FD_TYPE_IOREGIONFD = 1 +VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW = 2 # enum vfu_dev_irq_type @@ -620,7 +621,7 @@ class vfio_user_migration_info(Structure): lib.vfu_create_ioeventfd.argtypes = (c.c_void_p, c.c_uint32, c.c_int, c.c_size_t, c.c_uint32, c.c_uint32, - c.c_uint64) + c.c_uint64, c.c_int32) lib.vfu_device_quiesced.argtypes = (c.c_void_p, c.c_int) @@ -635,6 +636,10 @@ def to_byte(val): return val.to_bytes(1, 'little') +def to_bytes_le(n, length=1): + return n.to_bytes(length, 'little') + + def skip(fmt, buf): """Return the data remaining after skipping the given elements.""" return buf[struct.calcsize(fmt):] @@ -645,6 +650,9 @@ def parse_json(json_str): return json.loads(json_str, object_hook=lambda d: SimpleNamespace(**d)) +IOEVENT_SIZE = 8 + + def eventfd(initval=0, flags=0): libc.eventfd.argtypes = (c.c_uint, c.c_int) return libc.eventfd(initval, flags) @@ -1184,11 +1192,12 @@ def vfu_sgl_put(ctx, sg, iovec, cnt=1): return lib.vfu_sgl_put(ctx, sg, iovec, cnt) -def vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, flags, datamatch): +def vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, flags, datamatch, + shadow_fd=-1): assert ctx is not None return lib.vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, - flags, datamatch) + flags, datamatch, shadow_fd) def vfu_device_quiesced(ctx, err): diff --git a/test/py/meson.build b/test/py/meson.build index d9c97b3c..0ea9f08b 100644 --- a/test/py/meson.build +++ b/test/py/meson.build @@ -49,6 +49,10 @@ python_tests = [ 'test_vfu_realize_ctx.py', ] +if get_option('shadow-ioeventfd') + python_tests += 'test_shadow_ioeventfd.py' +endif + python_files = python_tests_common + python_tests if pytest.found() and opt_sanitizers == 'none' diff --git a/test/py/test_device_get_region_io_fds.py b/test/py/test_device_get_region_io_fds.py index cb1b7325..924f462e 100644 --- a/test/py/test_device_get_region_io_fds.py +++ b/test/py/test_device_get_region_io_fds.py @@ -36,7 +36,6 @@ ctx = None sock = None fds = [] -IOEVENT_SIZE = 8 def test_device_get_region_io_fds_setup(): diff --git a/test/py/test_pci_caps.py b/test/py/test_pci_caps.py index b83c06c4..01f23a24 100644 --- a/test/py/test_pci_caps.py +++ b/test/py/test_pci_caps.py @@ -349,10 +349,6 @@ def test_pci_cap_write_px(mock_quiesce, mock_reset): expect=errno.EINVAL) -def to_bytes_le(n, length=1): - return n.to_bytes(length, 'little') - - def test_pci_cap_write_msix(): setup_pci_dev(realize=True) sock = connect_client(ctx) diff --git a/test/py/test_shadow_ioeventfd.py b/test/py/test_shadow_ioeventfd.py new file mode 100644 index 00000000..642ad0e9 --- /dev/null +++ b/test/py/test_shadow_ioeventfd.py @@ -0,0 +1,105 @@ +# +# Copyright (c) 2022 Nutanix Inc. All rights reserved. +# +# Authors: Thanos Makatos +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Nutanix nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +# DAMAGE. +# + +from libvfio_user import * +import tempfile +import mmap +import errno + + +def test_shadow_ioeventfd(): + """Configure a shadow ioeventfd, have the client trigger it, confirm that + the server receives the notification and can see the value.""" + + # server setup + ctx = vfu_create_ctx(flags=LIBVFIO_USER_FLAG_ATTACH_NB) + assert ctx is not None + ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_BAR0_REGION_IDX, size=0x1000, + flags=VFU_REGION_FLAG_RW) + assert ret == 0 + fo = tempfile.TemporaryFile(dir="/dev/shm") + fo.truncate(0x1000) + + # FIXME + # Use pip install eventfd? + # $ grep EFD_NONBLOCK -wr /usr/include/ + # /usr/include/bits/eventfd.h: EFD_NONBLOCK = 00004000 + EFD_NONBLOCK = 0o00004000 + + efd = eventfd(flags=EFD_NONBLOCK) + ret = vfu_create_ioeventfd(ctx, VFU_PCI_DEV_BAR0_REGION_IDX, efd, 0x8, + 0x16, 0, 0, shadow_fd=fo.fileno()) + assert ret == 0 + ret = vfu_realize_ctx(ctx) + assert ret == 0 + + # client queries I/O region FDs + sock = connect_client(ctx) + payload = vfio_user_region_io_fds_request( + argsz=len(vfio_user_region_io_fds_reply()) + + len(vfio_user_sub_region_ioeventfd()), flags=0, + index=VFU_PCI_DEV_BAR0_REGION_IDX, count=0) + newfds, ret = msg_fds(ctx, sock, VFIO_USER_DEVICE_GET_REGION_IO_FDS, + payload, expect=0) + reply, ret = vfio_user_region_io_fds_reply.pop_from_buffer(ret) + assert reply.count == 1 # 1 eventfd + ioevent, _ = vfio_user_sub_region_ioeventfd.pop_from_buffer(ret) + assert ioevent.offset == 0x8 + assert ioevent.size == 0x16 + assert ioevent.fd_index == 0 + assert ioevent.type == VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW + assert ioevent.flags == 0 + assert ioevent.datamatch == 0 + + assert len(newfds) == 2 # 2 FDs: eventfd plus shadow FD + cefd = newfds[0] + csfd = newfds[1] + cmem = mmap.mmap(csfd, 0x1000) + + # vfio-user app reads the eventfd, there should be nothing there + try: + os.read(efd, IOEVENT_SIZE) + except BlockingIOError as e: + if e.errno != errno.EAGAIN: + assert False + else: + assert False + + # Client writes to the I/O region. The write to the eventfd would be done + # by KVM and the value would be the same in both cases. + cmem.seek(0x8) + cmem.write(c.c_ulonglong(0xdeadbeef)) + os.write(cefd, c.c_ulonglong(0xcafebabe)) + + # vfio-user app reads eventfd + assert os.read(efd, IOEVENT_SIZE) == to_bytes_le(0xcafebabe, 8) + fo.seek(0x8) + assert fo.read(0x8) == to_bytes_le(0xdeadbeef, 8) + + vfu_destroy_ctx(ctx)