From af4d13842059e8dda4b0d976b65b4bdd185a14f4 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Tue, 31 May 2022 09:50:18 -0400 Subject: [PATCH 01/11] FIXME support for shadow ioeventfd Signed-off-by: Thanos Makatos Change-Id: Iad849c94076ffa5988e034c8bf7ec312d01f095f --- include/libvfio-user.h | 2 +- include/vfio-user.h | 1 + lib/libvfio-user.c | 16 ++++++++++++++-- lib/private.h | 1 + 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/libvfio-user.h b/include/libvfio-user.h index 84eb2d88..1cfd5e50 100644 --- a/include/libvfio-user.h +++ b/include/libvfio-user.h @@ -1073,7 +1073,7 @@ vfu_sg_is_mappable(vfu_ctx_t *vfu_ctx, dma_sg_t *sg); int vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, size_t offset, uint32_t size, uint32_t flags, - uint64_t datamatch); + uint64_t datamatch, int data_fd); #ifdef __cplusplus } diff --git a/include/vfio-user.h b/include/vfio-user.h index 7439484b..dc6fafaa 100644 --- a/include/vfio-user.h +++ b/include/vfio-user.h @@ -168,6 +168,7 @@ typedef struct vfio_user_region_io_fds_request { #define VFIO_USER_IO_FD_TYPE_IOEVENTFD 0 #define VFIO_USER_IO_FD_TYPE_IOREGIONFD 1 +#define VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW 2 typedef struct vfio_user_sub_region_ioeventfd { uint64_t offset; diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c index ac04d3ba..45215c8f 100644 --- a/lib/libvfio-user.c +++ b/lib/libvfio-user.c @@ -467,7 +467,7 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) EXPORT int vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, size_t offset, uint32_t size, uint32_t flags, - uint64_t datamatch) + uint64_t datamatch, int data_fd) { vfu_reg_info_t *vfu_reg; @@ -494,6 +494,7 @@ vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, elem->size = size; elem->flags = flags; elem->datamatch = datamatch; + elem->data_fd = data_fd; LIST_INSERT_HEAD(&vfu_reg->subregions, elem, entry); return 0; @@ -580,6 +581,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) // At least one flag must be set for a valid region. if (!(vfu_reg->flags & VFU_REGION_FLAG_MASK)) { + vfu_log(vfu_ctx, LOG_DEBUG, "no flag for region index %d", req->index); return ERROR_INT(EINVAL); } @@ -627,7 +629,13 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ioefd->size = sub_reg->size; ioefd->fd_index = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->fd); - ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD; + if (sub_reg->data_fd == -1) { + ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD; + } else { + ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW; + int ret = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->data_fd); + assert(ret == 1); + } ioefd->flags = sub_reg->flags; ioefd->datamatch = sub_reg->datamatch; @@ -1832,6 +1840,10 @@ vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size, assert(vfu_ctx != NULL); + /* + * FIXME should we set a flag in the region to indicate that it has I/O + * region FDs? This way the client won't have to blindly look for them. + */ if ((flags & ~(VFU_REGION_FLAG_MASK)) || (!(flags & VFU_REGION_FLAG_RW))) { vfu_log(vfu_ctx, LOG_ERR, "invalid region flags"); diff --git a/lib/private.h b/lib/private.h index 7ffd6bec..29bc8b85 100644 --- a/lib/private.h +++ b/lib/private.h @@ -186,6 +186,7 @@ typedef struct ioeventfd { int32_t fd; uint32_t flags; uint64_t datamatch; + int32_t data_fd; LIST_ENTRY(ioeventfd) entry; } ioeventfd_t; From c273dcb96a2e695129c91e36bef60dd743088254 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Tue, 31 May 2022 09:50:18 -0400 Subject: [PATCH 02/11] experimental support for shadow ioeventfd When an ioeventfd is written to, KVM discards the value since it has no memory to write it to, and simply kicks the eventfd. This a problem for devices such a NVMe controllers that need the value (e.g. doorbells on BAR0). This patch allows the vfio-user server to pass a file descriptor that can be mmap'ed and KVM can write the ioeventfd value to this _shadow_ memory instead of discarding it. This shadow memory is not exposed to the guest. Signed-off-by: Thanos Makatos Change-Id: Iad849c94076ffa5988e034c8bf7ec312d01f095f --- include/libvfio-user.h | 10 ++++++++-- lib/libvfio-user.c | 19 ++++++++++--------- lib/private.h | 2 +- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/include/libvfio-user.h b/include/libvfio-user.h index 1cfd5e50..1164e2d8 100644 --- a/include/libvfio-user.h +++ b/include/libvfio-user.h @@ -1069,12 +1069,18 @@ vfu_sg_is_mappable(vfu_ctx_t *vfu_ctx, dma_sg_t *sg); * @size: size of the ioeventfd * @flags: Any flags to set up the ioeventfd * @datamatch: sets the datamatch value + * @shadow_fd: File descriptor than can be mmap'ed, KVM will write there the + * otherwise discarded value when the ioeventfd is written to. If set to -1 + * then a normal ioeventfd is set up instead of a shadow one. Then vfio-user + * client is free to ignore this, even if it supports shadow ioeventfds. + * Requires a kernel with shadow ioeventfd support. + * Experimental, must be compiled with SHADOW_IOEVENTFD defined, otherwise + * must be -1. */ int vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, size_t offset, uint32_t size, uint32_t flags, - uint64_t datamatch, int data_fd); - + uint64_t datamatch, int shadow_fd); #ifdef __cplusplus } #endif diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c index 45215c8f..2c510f09 100644 --- a/lib/libvfio-user.c +++ b/lib/libvfio-user.c @@ -467,13 +467,19 @@ handle_device_get_region_info(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) EXPORT int vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, size_t offset, uint32_t size, uint32_t flags, - uint64_t datamatch, int data_fd) + uint64_t datamatch, int shadow_fd) { vfu_reg_info_t *vfu_reg; assert(vfu_ctx != NULL); assert(fd >= 0); +#ifndef SHADOW_IOEVENTFD + if (shadow_fd != -1) { + return ERROR_INT(EINVAL); + } +#endif + if (region_idx >= VFU_PCI_DEV_NUM_REGIONS) { return ERROR_INT(EINVAL); } @@ -494,7 +500,7 @@ vfu_create_ioeventfd(vfu_ctx_t *vfu_ctx, uint32_t region_idx, int fd, elem->size = size; elem->flags = flags; elem->datamatch = datamatch; - elem->data_fd = data_fd; + elem->shadow_fd = shadow_fd; LIST_INSERT_HEAD(&vfu_reg->subregions, elem, entry); return 0; @@ -581,7 +587,6 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) // At least one flag must be set for a valid region. if (!(vfu_reg->flags & VFU_REGION_FLAG_MASK)) { - vfu_log(vfu_ctx, LOG_DEBUG, "no flag for region index %d", req->index); return ERROR_INT(EINVAL); } @@ -629,11 +634,11 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ioefd->size = sub_reg->size; ioefd->fd_index = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->fd); - if (sub_reg->data_fd == -1) { + if (sub_reg->shadow_fd == -1) { ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD; } else { ioefd->type = VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW; - int ret = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->data_fd); + int ret = add_fd_index(msg->out.fds, &msg->out.nr_fds, sub_reg->shadow_fd); assert(ret == 1); } ioefd->flags = sub_reg->flags; @@ -1840,10 +1845,6 @@ vfu_setup_region(vfu_ctx_t *vfu_ctx, int region_idx, size_t size, assert(vfu_ctx != NULL); - /* - * FIXME should we set a flag in the region to indicate that it has I/O - * region FDs? This way the client won't have to blindly look for them. - */ if ((flags & ~(VFU_REGION_FLAG_MASK)) || (!(flags & VFU_REGION_FLAG_RW))) { vfu_log(vfu_ctx, LOG_ERR, "invalid region flags"); diff --git a/lib/private.h b/lib/private.h index 29bc8b85..b8751385 100644 --- a/lib/private.h +++ b/lib/private.h @@ -186,7 +186,7 @@ typedef struct ioeventfd { int32_t fd; uint32_t flags; uint64_t datamatch; - int32_t data_fd; + int32_t shadow_fd; LIST_ENTRY(ioeventfd) entry; } ioeventfd_t; From 0ecdd2697f58d22b5bb114151dea42843c37e4b1 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Thu, 23 Jun 2022 21:04:28 +0000 Subject: [PATCH 03/11] fix broken unit tests Signed-off-by: Thanos Makatos Change-Id: Idedbdde5e6315e5ce568cc14d4d6710d8a9af094 --- test/py/libvfio_user.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/test/py/libvfio_user.py b/test/py/libvfio_user.py index 0ebf7738..5cdd1e12 100644 --- a/test/py/libvfio_user.py +++ b/test/py/libvfio_user.py @@ -620,7 +620,7 @@ class vfio_user_migration_info(Structure): lib.vfu_create_ioeventfd.argtypes = (c.c_void_p, c.c_uint32, c.c_int, c.c_size_t, c.c_uint32, c.c_uint32, - c.c_uint64) + c.c_uint64, c.c_int32) lib.vfu_device_quiesced.argtypes = (c.c_void_p, c.c_int) @@ -1184,11 +1184,12 @@ def vfu_sgl_put(ctx, sg, iovec, cnt=1): return lib.vfu_sgl_put(ctx, sg, iovec, cnt) -def vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, flags, datamatch): +def vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, flags, datamatch, + shadow_fd=-1): assert ctx is not None return lib.vfu_create_ioeventfd(ctx, region_idx, fd, offset, size, - flags, datamatch) + flags, datamatch, shadow_fd) def vfu_device_quiesced(ctx, err): From b26918a68f2f62234ea68eb90c00910518060276 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Fri, 1 Jul 2022 11:22:35 +0000 Subject: [PATCH 04/11] add unit test Signed-off-by: Thanos Makatos --- meson.build | 5 ++ meson_options.txt | 2 + test/py/libvfio_user.py | 6 ++ test/py/meson.build | 5 ++ test/py/test_device_get_region_io_fds.py | 1 - test/py/test_pci_caps.py | 4 - test/py/test_shadow_ioeventfd.py | 94 ++++++++++++++++++++++++ 7 files changed, 112 insertions(+), 5 deletions(-) create mode 100644 test/py/test_shadow_ioeventfd.py diff --git a/meson.build b/meson.build index 03e92674..dba2ba69 100644 --- a/meson.build +++ b/meson.build @@ -18,6 +18,7 @@ opt_tran_pipe = get_option('tran-pipe') opt_debug_logs = get_option('debug-logs') opt_sanitizers = get_option('b_sanitize') opt_debug = get_option('debug') +opt_shadow_ioeventfd = get_option('shadow-ioeventfd') cc = meson.get_compiler('c') @@ -57,6 +58,10 @@ if opt_debug_logs.enabled() or (not opt_debug_logs.disabled() and opt_debug) common_cflags += ['-DDEBUG'] endif +if opt_shadow_ioeventfd + common_cflags += ['-DSHADOW_IOEVENTFD'] +endif + if get_option('warning_level') == '2' # -Wall is set for 'warning_level>=1' # -Wextra is set for 'warning_level>=2' diff --git a/meson_options.txt b/meson_options.txt index 49e4d9ac..52973d65 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -4,3 +4,5 @@ option('tran-pipe', type: 'boolean', value: false, description: 'enable pipe transport for testing') option('debug-logs', type: 'feature', value: 'auto', description: 'enable extra debugging code (default for debug builds)') +option('shadow-ioeventfd', type: 'boolean', value : true, + description: 'enable shadow ioeventfd (experimental)') diff --git a/test/py/libvfio_user.py b/test/py/libvfio_user.py index 5cdd1e12..2898c996 100644 --- a/test/py/libvfio_user.py +++ b/test/py/libvfio_user.py @@ -635,6 +635,10 @@ def to_byte(val): return val.to_bytes(1, 'little') +def to_bytes_le(n, length=1): + return n.to_bytes(length, 'little') + + def skip(fmt, buf): """Return the data remaining after skipping the given elements.""" return buf[struct.calcsize(fmt):] @@ -645,6 +649,8 @@ def parse_json(json_str): return json.loads(json_str, object_hook=lambda d: SimpleNamespace(**d)) +IOEVENT_SIZE = 8 + def eventfd(initval=0, flags=0): libc.eventfd.argtypes = (c.c_uint, c.c_int) return libc.eventfd(initval, flags) diff --git a/test/py/meson.build b/test/py/meson.build index d9c97b3c..6ebd7d3f 100644 --- a/test/py/meson.build +++ b/test/py/meson.build @@ -49,6 +49,11 @@ python_tests = [ 'test_vfu_realize_ctx.py', ] +opt_shadow_ioeventfd = get_option('shadow-ioeventfd') +if opt_shadow_ioeventfd + python_tests += 'test_shadow_ioeventfd.py' +endif + python_files = python_tests_common + python_tests if pytest.found() and opt_sanitizers == 'none' diff --git a/test/py/test_device_get_region_io_fds.py b/test/py/test_device_get_region_io_fds.py index cb1b7325..924f462e 100644 --- a/test/py/test_device_get_region_io_fds.py +++ b/test/py/test_device_get_region_io_fds.py @@ -36,7 +36,6 @@ ctx = None sock = None fds = [] -IOEVENT_SIZE = 8 def test_device_get_region_io_fds_setup(): diff --git a/test/py/test_pci_caps.py b/test/py/test_pci_caps.py index b83c06c4..01f23a24 100644 --- a/test/py/test_pci_caps.py +++ b/test/py/test_pci_caps.py @@ -349,10 +349,6 @@ def test_pci_cap_write_px(mock_quiesce, mock_reset): expect=errno.EINVAL) -def to_bytes_le(n, length=1): - return n.to_bytes(length, 'little') - - def test_pci_cap_write_msix(): setup_pci_dev(realize=True) sock = connect_client(ctx) diff --git a/test/py/test_shadow_ioeventfd.py b/test/py/test_shadow_ioeventfd.py new file mode 100644 index 00000000..f762eb2d --- /dev/null +++ b/test/py/test_shadow_ioeventfd.py @@ -0,0 +1,94 @@ +# +# Copyright (c) 2022 Nutanix Inc. All rights reserved. +# +# Authors: Thanos Makatos +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Nutanix nor the names of its contributors may be +# used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY +# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +# DAMAGE. +# + +from libvfio_user import * +import tempfile +import mmap +import errno + +def test_shadow_ioeventfd(): + """Configure a shadow ioeventfd, have the client trigger it, confirm that + the server receives the notification and can see the value.""" + + # server setup + ctx = vfu_create_ctx(flags=LIBVFIO_USER_FLAG_ATTACH_NB) + assert ctx is not None + ret = vfu_setup_region(ctx, index=VFU_PCI_DEV_BAR0_REGION_IDX, size=0x1000, + flags=VFU_REGION_FLAG_RW) + assert ret == 0 + fo = tempfile.TemporaryFile(dir="/dev/shm") + fo.truncate(0x1000) + + # FIXME + # Use pip install eventfd? + # $ grep EFD_NONBLOCK -wr /usr/include/ + # /usr/include/bits/eventfd.h: EFD_NONBLOCK = 00004000 + EFD_NONBLOCK = 0o00004000 + + efd = eventfd(flags=EFD_NONBLOCK) + ret = vfu_create_ioeventfd(ctx, VFU_PCI_DEV_BAR0_REGION_IDX, efd, 0x8, + 0x16, 0, 0, shadow_fd=fo.fileno()) + assert ret == 0 + ret = vfu_realize_ctx(ctx) + assert ret == 0 + + # client queries I/O region FDs + sock = connect_client(ctx) + payload = vfio_user_region_io_fds_request( + argsz=len(vfio_user_region_io_fds_reply()) + + len(vfio_user_sub_region_ioeventfd()), flags=0, + index=VFU_PCI_DEV_BAR0_REGION_IDX, count=0) + newfds, ret = msg_fds(ctx, sock, VFIO_USER_DEVICE_GET_REGION_IO_FDS, + payload, expect=0) + reply, ret = vfio_user_region_io_fds_reply.pop_from_buffer(ret) + assert reply.count == 1 # 1 eventfd + assert len(newfds) == 2 # 2 FDs: eventfd plus shadow FD + cefd = newfds[0] + csfd = newfds[1] + cmem = mmap.mmap(csfd, 0x1000) # FIXME must get length from server response + + # vfio-user app reads the eventfd, there should be nothing there + try: + os.read(efd, IOEVENT_SIZE) + except BlockingIOError as e: + if e.errno != errno.EAGAIN: + assert False + else: + assert False + + # Client writes to the I/O region. Tthe write to the eventfd would by done + # by KVM and the value would be the same in both cases. + cmem.seek(0x8) + cmem.write(c.c_ulonglong(0xdeadbeef)) + os.write(cefd, c.c_ulonglong(0xcafebabe)) + + # vfio-user app reads eventfd + assert os.read(efd, IOEVENT_SIZE) == to_bytes_le(0xcafebabe, 8) + fo.seek(0x8) + assert fo.read(0x8) == to_bytes_le(0xdeadbeef, 8) From dcc96382adeded7fcad9182b2650b5f9dd4c9259 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Fri, 1 Jul 2022 12:37:38 +0000 Subject: [PATCH 05/11] more test fixes Signed-off-by: Thanos Makatos --- test/py/libvfio_user.py | 2 ++ test/py/test_shadow_ioeventfd.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/test/py/libvfio_user.py b/test/py/libvfio_user.py index 2898c996..c8525429 100644 --- a/test/py/libvfio_user.py +++ b/test/py/libvfio_user.py @@ -194,6 +194,7 @@ VFIO_USER_IO_FD_TYPE_IOEVENTFD = 0 VFIO_USER_IO_FD_TYPE_IOREGIONFD = 1 +VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW = 2 # enum vfu_dev_irq_type @@ -651,6 +652,7 @@ def parse_json(json_str): IOEVENT_SIZE = 8 + def eventfd(initval=0, flags=0): libc.eventfd.argtypes = (c.c_uint, c.c_int) return libc.eventfd(initval, flags) diff --git a/test/py/test_shadow_ioeventfd.py b/test/py/test_shadow_ioeventfd.py index f762eb2d..9a11db7f 100644 --- a/test/py/test_shadow_ioeventfd.py +++ b/test/py/test_shadow_ioeventfd.py @@ -32,6 +32,7 @@ import mmap import errno + def test_shadow_ioeventfd(): """Configure a shadow ioeventfd, have the client trigger it, confirm that the server receives the notification and can see the value.""" @@ -67,11 +68,19 @@ def test_shadow_ioeventfd(): newfds, ret = msg_fds(ctx, sock, VFIO_USER_DEVICE_GET_REGION_IO_FDS, payload, expect=0) reply, ret = vfio_user_region_io_fds_reply.pop_from_buffer(ret) - assert reply.count == 1 # 1 eventfd - assert len(newfds) == 2 # 2 FDs: eventfd plus shadow FD + assert reply.count == 1 # 1 eventfd + ioevent, _ = vfio_user_sub_region_ioeventfd.pop_from_buffer(ret) + assert ioevent.offset == 0x8 + assert ioevent.size == 0x16 + assert ioevent.fd_index == 0 + assert ioevent.type == VFIO_USER_IO_FD_TYPE_IOEVENTFD_SHADOW + assert ioevent.flags == 0 + assert ioevent.datamatch == 0 + + assert len(newfds) == 2 # 2 FDs: eventfd plus shadow FD cefd = newfds[0] csfd = newfds[1] - cmem = mmap.mmap(csfd, 0x1000) # FIXME must get length from server response + cmem = mmap.mmap(csfd, 0x1000) # vfio-user app reads the eventfd, there should be nothing there try: From d99f321c963a2b8b59532865684ff0e54ec6f1b8 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Fri, 1 Jul 2022 15:03:17 +0000 Subject: [PATCH 06/11] don't write to unallocated memory Signed-off-by: Thanos Makatos --- lib/libvfio-user.c | 6 +++++- test/py/test_shadow_ioeventfd.py | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c index 2c510f09..f9dd1024 100644 --- a/lib/libvfio-user.c +++ b/lib/libvfio-user.c @@ -562,6 +562,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ioeventfd_t *sub_reg = NULL; size_t nr_sub_reg = 0; size_t i = 0; + size_t nr_shdw_reg = 0; assert(vfu_ctx != NULL); assert(msg != NULL); @@ -592,6 +593,9 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) LIST_FOREACH(sub_reg, &vfu_reg->subregions, entry) { nr_sub_reg++; + if (sub_reg->shadow_fd != -1) { + nr_shdw_reg++; + } } if (req->argsz < sizeof(vfio_user_region_io_fds_reply_t) || @@ -621,7 +625,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) msg->out.nr_fds = 0; if (req->argsz >= reply->argsz) { - msg->out.fds = calloc(sizeof(int), max_sent_sub_regions); + msg->out.fds = calloc(sizeof(int), max_sent_sub_regions + nr_shdw_reg); if (msg->out.fds == NULL) { return -1; } diff --git a/test/py/test_shadow_ioeventfd.py b/test/py/test_shadow_ioeventfd.py index 9a11db7f..9187fb60 100644 --- a/test/py/test_shadow_ioeventfd.py +++ b/test/py/test_shadow_ioeventfd.py @@ -101,3 +101,5 @@ def test_shadow_ioeventfd(): assert os.read(efd, IOEVENT_SIZE) == to_bytes_le(0xcafebabe, 8) fo.seek(0x8) assert fo.read(0x8) == to_bytes_le(0xdeadbeef, 8) + + vfu_destroy_ctx(ctx) From 631d1b85b7410cd996cd3a003937b246b1393246 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Mon, 4 Jul 2022 09:29:16 +0000 Subject: [PATCH 07/11] add doc for ioregionfd Signed-off-by: Thanos Makatos --- docs/ioregionfd.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/ioregionfd.md diff --git a/docs/ioregionfd.md b/docs/ioregionfd.md new file mode 100644 index 00000000..d37b74af --- /dev/null +++ b/docs/ioregionfd.md @@ -0,0 +1,16 @@ +# ioregionfd + +ioregionfd is a mechanism that speeds up ioeventfds: +https://lore.kernel.org/kvm/cover.1613828726.git.eafanasova@gmail.com/. In the +author's original words: "ioregionfd is a KVM dispatch mechanism which can be +used for handling MMIO/PIO accesses over file descriptors without returning +from ioctl(KVM_RUN).". + +libvfio-user currently supports an experimental variant of this mechanism +called shadow ioeventfd. A shadow ioeventfd is a normal ioeventfd where the +vfio-user server passes another piece of memory (called the _shadow_ memory) +via an additional file descriptor when configuring the ioregionfd, which then +QEMU memory maps and passes this address to KVM. This shadow memory is never +exposed to the guest. When the guest writes to the trapped memory, KVM writes +the value to the shadow memory instread of discarding it, and then proceeds +kicking the evetnfd as normal. From 839934e23bd646406af922fa793961ad183878c2 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Mon, 4 Jul 2022 09:35:02 +0000 Subject: [PATCH 08/11] address John's comments --- include/libvfio-user.h | 4 ++-- lib/libvfio-user.c | 7 ++++--- test/py/meson.build | 3 +-- test/py/test_shadow_ioeventfd.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/libvfio-user.h b/include/libvfio-user.h index 1164e2d8..d12d89f0 100644 --- a/include/libvfio-user.h +++ b/include/libvfio-user.h @@ -1069,9 +1069,9 @@ vfu_sg_is_mappable(vfu_ctx_t *vfu_ctx, dma_sg_t *sg); * @size: size of the ioeventfd * @flags: Any flags to set up the ioeventfd * @datamatch: sets the datamatch value - * @shadow_fd: File descriptor than can be mmap'ed, KVM will write there the + * @shadow_fd: File descriptor that can be mmap'ed, KVM will write there the * otherwise discarded value when the ioeventfd is written to. If set to -1 - * then a normal ioeventfd is set up instead of a shadow one. Then vfio-user + * then a normal ioeventfd is set up instead of a shadow one. The vfio-user * client is free to ignore this, even if it supports shadow ioeventfds. * Requires a kernel with shadow ioeventfd support. * Experimental, must be compiled with SHADOW_IOEVENTFD defined, otherwise diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c index f9dd1024..5ce57679 100644 --- a/lib/libvfio-user.c +++ b/lib/libvfio-user.c @@ -562,7 +562,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) ioeventfd_t *sub_reg = NULL; size_t nr_sub_reg = 0; size_t i = 0; - size_t nr_shdw_reg = 0; + size_t nr_shadow_reg = 0; assert(vfu_ctx != NULL); assert(msg != NULL); @@ -594,7 +594,7 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) LIST_FOREACH(sub_reg, &vfu_reg->subregions, entry) { nr_sub_reg++; if (sub_reg->shadow_fd != -1) { - nr_shdw_reg++; + nr_shadow_reg++; } } @@ -625,7 +625,8 @@ handle_device_get_region_io_fds(vfu_ctx_t *vfu_ctx, vfu_msg_t *msg) msg->out.nr_fds = 0; if (req->argsz >= reply->argsz) { - msg->out.fds = calloc(sizeof(int), max_sent_sub_regions + nr_shdw_reg); + msg->out.fds = calloc(sizeof(int), + max_sent_sub_regions + nr_shadow_reg); if (msg->out.fds == NULL) { return -1; } diff --git a/test/py/meson.build b/test/py/meson.build index 6ebd7d3f..0ea9f08b 100644 --- a/test/py/meson.build +++ b/test/py/meson.build @@ -49,8 +49,7 @@ python_tests = [ 'test_vfu_realize_ctx.py', ] -opt_shadow_ioeventfd = get_option('shadow-ioeventfd') -if opt_shadow_ioeventfd +if get_option('shadow-ioeventfd') python_tests += 'test_shadow_ioeventfd.py' endif diff --git a/test/py/test_shadow_ioeventfd.py b/test/py/test_shadow_ioeventfd.py index 9187fb60..642ad0e9 100644 --- a/test/py/test_shadow_ioeventfd.py +++ b/test/py/test_shadow_ioeventfd.py @@ -91,7 +91,7 @@ def test_shadow_ioeventfd(): else: assert False - # Client writes to the I/O region. Tthe write to the eventfd would by done + # Client writes to the I/O region. The write to the eventfd would be done # by KVM and the value would be the same in both cases. cmem.seek(0x8) cmem.write(c.c_ulonglong(0xdeadbeef)) From 54dc67461f0b4b14e1ab5001e88f6d76829c2cd4 Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Mon, 4 Jul 2022 09:36:18 +0000 Subject: [PATCH 09/11] disable shadow ioeventfd by default Signed-off-by: Thanos Makatos --- meson_options.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson_options.txt b/meson_options.txt index 52973d65..5dd2efdb 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -4,5 +4,5 @@ option('tran-pipe', type: 'boolean', value: false, description: 'enable pipe transport for testing') option('debug-logs', type: 'feature', value: 'auto', description: 'enable extra debugging code (default for debug builds)') -option('shadow-ioeventfd', type: 'boolean', value : true, +option('shadow-ioeventfd', type: 'boolean', value : false, description: 'enable shadow ioeventfd (experimental)') From 146e30c71ae2d992741f15f7c0c96b1a5f277fce Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Mon, 4 Jul 2022 10:09:57 +0000 Subject: [PATCH 10/11] fix typo Signed-off-by: Thanos Makatos --- docs/ioregionfd.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ioregionfd.md b/docs/ioregionfd.md index d37b74af..fb13d324 100644 --- a/docs/ioregionfd.md +++ b/docs/ioregionfd.md @@ -13,4 +13,4 @@ via an additional file descriptor when configuring the ioregionfd, which then QEMU memory maps and passes this address to KVM. This shadow memory is never exposed to the guest. When the guest writes to the trapped memory, KVM writes the value to the shadow memory instread of discarding it, and then proceeds -kicking the evetnfd as normal. +kicking the eventfd as normal. From c270d89e6498bd17f175487c620ca03f72864f5c Mon Sep 17 00:00:00 2001 From: Thanos Makatos Date: Mon, 4 Jul 2022 10:39:36 +0000 Subject: [PATCH 11/11] add patches for other components for shadow ioeventfd Signed-off-by: Thanos Makatos --- docs/ioregionfd.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/ioregionfd.md b/docs/ioregionfd.md index fb13d324..c09b077c 100644 --- a/docs/ioregionfd.md +++ b/docs/ioregionfd.md @@ -14,3 +14,12 @@ QEMU memory maps and passes this address to KVM. This shadow memory is never exposed to the guest. When the guest writes to the trapped memory, KVM writes the value to the shadow memory instread of discarding it, and then proceeds kicking the eventfd as normal. + +To use shadow ioeventfd, the kernel and QEMU need to be patched. The QEMU patch +is designed specifically for SPDK's doorbells (one ioregionfd of 4K in BAR0); +it should be trivial to extend. + +The list of patches: +* kernel: https://gist.github.com/tmakatos/532afd092a8df2175120d3dbfcd719ef +* QEMU: https://gist.github.com/tmakatos/57755d2a37a6d53c9ff392e7c34470f6 +* SPDK: https://gist.github.com/tmakatos/f6c10fdaff59c9d629f94bd8e44a53bc